diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 929952e..d700a7f 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -78,7 +78,7 @@ The accessor discovers the `MultiBandStacBackendArray` from the DataArray's lazy - `summary()` — multi-line human-readable report with COG-read distribution histogram - `to_dataframe()` — one row per `(chunk, COG file)` for further analysis in pandas -With `fetch_headers=True`, each matched COG header is fetched (a small HTTP range request to read the IFD block) and `CogRead.overview_level`, `window_col_off`, `window_row_off`, `window_width`, and `window_height` are populated. +With `fetch_headers=True`, each matched COG header is fetched (a small HTTP range request to read the IFD block) and `CogRead.overview_level`, `window_col_off`, `window_row_off`, `window_width`, and `window_height` are populated. This path shares `_chunk_reader._open_and_window()` with real reads through a lightweight window context, but does not construct the full read context or resolve dtype/nodata pixel contracts. ## Phase 1 in detail @@ -110,7 +110,7 @@ A `rasterix.RasterIndex` is attached to every DataArray returned by `open()`. Th ## Per-chunk read and resample pipeline -Each call to `_read_item_band()` in `_chunk_reader.py` first validates the source asset against the inferred output contract, then follows a four-step pipeline to turn a remote COG into a correctly-sized, correctly-projected numpy array for one destination chunk. +Each call to `_read_item_band()` in `_chunk_reader.py` first validates the source asset against the inferred output contract, then follows a four-step pipeline to turn a remote COG into a correctly-sized, correctly-projected numpy array for one destination chunk. Header/window helpers consume the smaller `_WindowContext`; `_ChunkContext` extends it with read-only fields such as nodata, output dtype, explicitness flags, and the warp cache. ### 1. Overview selection diff --git a/src/lazycogs/_chunk_reader.py b/src/lazycogs/_chunk_reader.py index 1bfb86c..4993940 100644 --- a/src/lazycogs/_chunk_reader.py +++ b/src/lazycogs/_chunk_reader.py @@ -33,24 +33,35 @@ @dataclass(frozen=True) -class _ChunkContext: - """Immutable per-chunk parameters shared across all item reads. +class _WindowContext: + """Immutable parameters needed to open a COG and compute a read window. - Built once per chunk in async_mosaic_chunk and passed through to all - internal helpers. Frozen to prevent accidental mutation across concurrent - coroutines. + This header/window-only context is shared by dry-run explain and real chunk + reads. It intentionally excludes pixel-read contract fields such as dtype, + nodata, and warp caches. """ chunk_affine: Affine dst_crs: CRS chunk_width: int chunk_height: int + store: Store | None + path_fn: Callable[[str], str] | None + + +@dataclass(frozen=True) +class _ChunkContext(_WindowContext): + """Immutable per-chunk parameters shared across all item reads. + + Built once per chunk in async_mosaic_chunk and passed through to all + internal helpers. Frozen to prevent accidental mutation across concurrent + coroutines. + """ + nodata: float | int | None out_dtype: np.dtype dtype_was_explicit: bool nodata_was_explicit: bool - store: Store | None - path_fn: Callable[[str], str] | None warp_cache: dict[tuple[tuple[float, ...], CRS], WarpMap] | None @@ -313,7 +324,7 @@ def _native_window( async def _open_and_window( item: dict, band: str, - ctx: _ChunkContext, + ctx: _WindowContext, ) -> tuple[GeoTIFF, GeoTIFF | Overview, Window | None, str] | None: """Open a COG asset and compute the pixel window covering the chunk. diff --git a/src/lazycogs/_explain.py b/src/lazycogs/_explain.py index dc540ef..3a9bec8 100644 --- a/src/lazycogs/_explain.py +++ b/src/lazycogs/_explain.py @@ -16,7 +16,7 @@ from xarray.core import indexing from lazycogs._backend import MultiBandStacBackendArray -from lazycogs._chunk_reader import _ChunkContext, _open_and_window +from lazycogs._chunk_reader import _open_and_window, _WindowContext from lazycogs._executor import run_duckdb, run_on_loop if TYPE_CHECKING: @@ -588,15 +588,13 @@ async def _inspect_item_async( the item has no matching asset or the chunk does not overlap. """ - ctx = _ChunkContext( + ctx = _WindowContext( chunk_affine=chunk_affine, dst_crs=dst_crs, chunk_width=chunk_width, chunk_height=chunk_height, - nodata=None, store=store, path_fn=None, - warp_cache=None, ) opened = await _open_and_window(item, band, ctx) if opened is None: diff --git a/tests/benchmarks/bench_pipeline.py b/tests/benchmarks/bench_pipeline.py index 529d47f..e41b4e0 100644 --- a/tests/benchmarks/bench_pipeline.py +++ b/tests/benchmarks/bench_pipeline.py @@ -58,6 +58,22 @@ def run() -> object: benchmark(run) +@pytest.mark.benchmark +def test_explain_fetch_headers(benchmark, benchmark_parquet: str) -> None: + """Explain plan with COG header opens: open + explain(fetch_headers=True).""" + + def run() -> object: + da = lazycogs.open( + benchmark_parquet, + bbox=BENCHMARK_BBOX, + crs=BENCHMARK_CRS, + resolution=60.0, + ) + return da.lazycogs.explain(fetch_headers=True) + + benchmark(run) + + @pytest.mark.benchmark @pytest.mark.parametrize("method", [FirstMethod, MedianMethod], ids=["first", "median"]) def test_mosaic_method( diff --git a/tests/benchmarks/test_regressions.py b/tests/benchmarks/test_regressions.py index 9b9402b..8736713 100644 --- a/tests/benchmarks/test_regressions.py +++ b/tests/benchmarks/test_regressions.py @@ -140,6 +140,28 @@ def test_open_explicit_overrides_win_over_benchmark_inference( assert da.encoding["_FillValue"] == -9999 +def test_explain_fetch_headers_uses_local_benchmark_data( + benchmark_parquet: str, +) -> None: + """Header-fetch explain works on real local COG metadata without pixel reads.""" + da = lazycogs.open( + benchmark_parquet, + bbox=BENCHMARK_BBOX, + crs=BENCHMARK_CRS, + resolution=60.0, + bands=BENCHMARK_SINGLE_BAND, + ) + + plan = da.lazycogs.explain(fetch_headers=True) + df = plan.to_dataframe() + + assert plan.fetch_headers is True + assert plan.total_cog_reads > 0 + assert df["window_width"].notna().any() + assert df["window_height"].notna().any() + assert df["overview_resolution"].notna().any() + + def test_open_rejects_conflicting_sampled_nodata_on_local_benchmark_copy( tmp_path: Path, benchmark_items: list[dict[str, Any]], diff --git a/tests/test_chunk_reader.py b/tests/test_chunk_reader.py index 19b4e41..3de130a 100644 --- a/tests/test_chunk_reader.py +++ b/tests/test_chunk_reader.py @@ -3,7 +3,8 @@ from __future__ import annotations import asyncio -from unittest.mock import MagicMock, patch +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock, patch import numpy as np import pytest @@ -12,9 +13,12 @@ from lazycogs._chunk_reader import ( _apply_bands_with_warp_cache, + _ChunkContext, _drain_in_order, _native_window, + _open_and_window, _select_overview, + _WindowContext, read_chunk_async, ) from lazycogs._executor import get_reproject_pool @@ -163,6 +167,79 @@ def test_native_window_clamped_to_image_bounds(): assert win.row_off + win.height <= 4 # clamped at image height +# --------------------------------------------------------------------------- +# _open_and_window context boundary +# --------------------------------------------------------------------------- + + +def test_chunk_context_is_a_window_context(): + """Read context is accepted anywhere header/window helpers expect context.""" + ctx = _ChunkContext( + chunk_affine=Affine(1.0, 0.0, 0.0, 0.0, -1.0, 4.0), + dst_crs=CRS.from_epsg(4326), + chunk_width=4, + chunk_height=4, + store=None, + path_fn=None, + nodata=None, + out_dtype=np.dtype("float32"), + dtype_was_explicit=False, + nodata_was_explicit=False, + warp_cache=None, + ) + + assert isinstance(ctx, _WindowContext) + assert ctx.chunk_width == 4 + assert ctx.chunk_height == 4 + + +def test_open_and_window_accepts_chunk_context(): + """_open_and_window uses only the stable header/window context fields.""" + ctx = _ChunkContext( + chunk_affine=Affine(1.0, 0.0, 0.0, 0.0, -1.0, 4.0), + dst_crs=CRS.from_epsg(4326), + chunk_width=4, + chunk_height=4, + store=None, + path_fn=None, + nodata=None, + out_dtype=np.dtype("float32"), + dtype_was_explicit=False, + nodata_was_explicit=False, + warp_cache=None, + ) + geotiff = SimpleNamespace( + crs=ctx.dst_crs, + overviews=[], + transform=ctx.chunk_affine, + width=4, + height=4, + ) + item = {"id": "item-0", "assets": {"red": {"href": "file:///tmp/red.tif"}}} + + with ( + patch( + "lazycogs._chunk_reader._resolve_store", + return_value=(None, "/tmp/red.tif"), + ), + patch( + "lazycogs._chunk_reader.GeoTIFF.open", + new_callable=AsyncMock, + return_value=geotiff, + ), + ): + opened = asyncio.run(_open_and_window(item, "red", ctx)) + + assert opened is not None + opened_geotiff, reader, window, path = opened + assert opened_geotiff is geotiff + assert reader is geotiff + assert window is not None + assert window.width == 4 + assert window.height == 4 + assert path == "/tmp/red.tif" + + # --------------------------------------------------------------------------- # read_chunk_async concurrency # --------------------------------------------------------------------------- diff --git a/tests/test_explain.py b/tests/test_explain.py index f25f774..3af91f9 100644 --- a/tests/test_explain.py +++ b/tests/test_explain.py @@ -2,6 +2,8 @@ from __future__ import annotations +import asyncio +from types import SimpleNamespace from unittest.mock import AsyncMock, patch import numpy as np @@ -13,6 +15,7 @@ from xarray.core import indexing from lazycogs._backend import MultiBandStacBackendArray +from lazycogs._chunk_reader import _ChunkContext, _WindowContext from lazycogs._explain import ( ChunkRead, CogRead, @@ -20,6 +23,7 @@ _compute_chunk_bbox_4326, _find_backend_array, _infer_chunk_sizes, + _inspect_item_async, _iter_spatial_chunks, _roi_pixel_offsets, ) @@ -458,6 +462,49 @@ def _fake_items(band: str, n: int) -> list[dict]: ] +def test_inspect_item_async_builds_window_context_for_header_fetch(wgs84): + """Header fetch explain path does not require pixel-read contract fields.""" + chunk_affine = Affine(1.0, 0.0, 0.0, 0.0, -1.0, 10.0) + store = object() + geotiff = SimpleNamespace(overviews=[], transform=chunk_affine) + reader = geotiff + window = SimpleNamespace(col_off=1, row_off=2, width=3, height=4) + + with patch( + "lazycogs._explain._open_and_window", + new_callable=AsyncMock, + return_value=(geotiff, reader, window, "s3://bucket/item-0.tif"), + ) as open_and_window: + read = asyncio.run( + _inspect_item_async( + _fake_items("red", 1)[0], + "red", + chunk_affine, + wgs84, + 10, + 10, + store, + ), + ) + + _, _, ctx = open_and_window.call_args.args + assert isinstance(ctx, _WindowContext) + assert not isinstance(ctx, _ChunkContext) + assert ctx.chunk_affine == chunk_affine + assert ctx.dst_crs == wgs84 + assert ctx.chunk_width == 10 + assert ctx.chunk_height == 10 + assert ctx.store is store + assert ctx.path_fn is None + assert not hasattr(ctx, "out_dtype") + assert not hasattr(ctx, "nodata") + assert read is not None + assert read.window_col_off == 1 + assert read.window_row_off == 2 + assert read.window_width == 3 + assert read.window_height == 4 + + def test_accessor_raises_on_non_stac_da(): """explain() raises ValueError when the array is not lazycogs-backed.""" da = xr.DataArray(np.zeros((3, 3)))