diff --git a/CHANGELOG.md b/CHANGELOG.md index f33a333..5a55de2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Subsetting to bounding box is now also done in the `s3` and `files` code paths [!11](https://github.com/dmidk/sunflow/pull/11), @JoachimKoenigslieb +- Subsetting to bounding box now correctly handles both ascending and descending lat/lon coordinates [!11](https://github.com/dmidk/sunflow/pull/11), @JoachimKoenigslieb + ### Changed - Pass down number of ensembles from configurations to `ProbabilisticAdvection`. This gives a roughly 3x speedup in the no ensembles case [!12](https://github.com/dmidk/sunflow/pull/12), @JoachimKoenigslieb diff --git a/sunflow/data_io.py b/sunflow/data_io.py index c78a5cb..70973d3 100644 --- a/sunflow/data_io.py +++ b/sunflow/data_io.py @@ -11,6 +11,7 @@ from loguru import logger from .config import NowcastConfig, S3Config +from .geospatial import subset_to_bbox from .validation import DataNotAvailableError @@ -192,6 +193,7 @@ def load_data_from_files( satellite_data_directory: str, data_type: str, filename_format: str, + bbox: str, ) -> xr.Dataset: """Load time steps from files (operational mode). @@ -232,7 +234,8 @@ def load_data_from_files( if not collected: return xr.Dataset() - return xr.concat(collected, dim="time", data_vars=None) + ds = xr.concat(collected, dim="time", data_vars=None) + return subset_to_bbox(ds, bbox) def check_current_data_existence_s3( @@ -281,6 +284,7 @@ def load_data_from_s3( s3_config: S3Config, data_type: str, filename_format: str, + bbox: str, ) -> xr.Dataset: """Load time steps from S3. @@ -325,7 +329,8 @@ def load_data_from_s3( if not collected: return xr.Dataset() - return xr.concat(collected, dim="time", data_vars=None) + ds = xr.concat(collected, dim="time", data_vars=None) + return subset_to_bbox(ds, bbox) def fetch_clearsky_with_fallback( @@ -392,6 +397,7 @@ def fetch_clearsky_with_fallback( nowcast_config.satellite_data_directory, "clearsky data", config["filename_format"], + bbox=bbox, ) case "s3": fetched = load_data_from_s3( @@ -401,6 +407,7 @@ def fetch_clearsky_with_fallback( s3_config, "clearsky data", config["filename_format"], + bbox=bbox, ) if offset > 0: diff --git a/sunflow/downloaders.py b/sunflow/downloaders.py index 13a19f7..0867160 100644 --- a/sunflow/downloaders.py +++ b/sunflow/downloaders.py @@ -8,6 +8,7 @@ from loguru import logger from .data_io import generate_input_filename +from .geospatial import subset_to_bbox def download_netcdf_knmi(url: str, api_key: str) -> BytesIO: @@ -66,44 +67,6 @@ def generate_dwd_filename(timestamp: datetime) -> str: return f"SISin{timestamp.strftime('%Y%m%d%H%M')}FDv3.nc" -def subset_to_bbox(ds: xr.Dataset, bbox: str) -> xr.Dataset: - """Subset an xarray Dataset to a geographic bounding box. - - Detects the latitude and longitude coordinate names automatically by - matching against common aliases ('lat', 'latitude', 'y' for - latitude; 'lon', 'longitude', 'x' for longitude). If coordinates - cannot be identified, the original dataset is returned unchanged with - a warning. - - Args: - ds: Input dataset to subset. - bbox: Bounding box string in the format lon_min,lat_min,lon_max,lat_max. - - Returns: - Dataset sliced to the requested bounding box, or the original - dataset if lat/lon coordinates could not be found. - """ - lon_min, lat_min, lon_max, lat_max = map(float, bbox.split(",")) - - lat_coord = lon_coord = None - for coord in ds.coords: - if coord.lower() in ["lat", "latitude", "y"]: - lat_coord = coord - elif coord.lower() in ["lon", "longitude", "x"]: - lon_coord = coord - - if lat_coord is None or lon_coord is None: - logger.warning("Could not find lat/lon coordinates") - return ds - - return ds.sel( - { - lat_coord: slice(lat_min, lat_max), - lon_coord: slice(lon_min, lon_max), - } - ) - - def download_current_data( request_time: datetime, config: dict[str, Any], diff --git a/sunflow/geospatial.py b/sunflow/geospatial.py index 4cbe385..80712ff 100644 --- a/sunflow/geospatial.py +++ b/sunflow/geospatial.py @@ -4,10 +4,45 @@ import numpy as np import pvlib import xarray as xr +from loguru import logger from .config import BBOX_OPTIONS +def subset_to_bbox(ds: xr.Dataset, bbox: str) -> xr.Dataset: + """Subset an xarray Dataset to a geographic bounding box.""" + lon_min, lat_min, lon_max, lat_max = map(float, bbox.split(",")) + + lat_coord = lon_coord = None + for coord in ds.coords: + if coord.lower() in ["lat", "latitude", "y"]: + lat_coord = coord + elif coord.lower() in ["lon", "longitude", "x"]: + lon_coord = coord + + if lat_coord is None or lon_coord is None: + logger.warning("Could not find lat/lon coordinates for bbox subsetting") + return ds + + # Handle potentially inverted coordinates by detecting order + lat_values = ds[lat_coord].values + lon_values = ds[lon_coord].values + + lat_ascending = lat_values[0] < lat_values[-1] + lon_ascending = lon_values[0] < lon_values[-1] + + # Arrange slice bounds based on coordinate order + lat_slice = slice(lat_min, lat_max) if lat_ascending else slice(lat_max, lat_min) + lon_slice = slice(lon_min, lon_max) if lon_ascending else slice(lon_max, lon_min) + + return ds.sel( + { + lat_coord: lat_slice, + lon_coord: lon_slice, + } + ) + + def get_bbox(bbox_choice: str, custom_bbox: str | None = None) -> str | None: """Return the bounding box string for a given bbox choice. diff --git a/sunflow/main.py b/sunflow/main.py index 5ac6f63..d29b49a 100644 --- a/sunflow/main.py +++ b/sunflow/main.py @@ -242,6 +242,7 @@ def run_nowcast( nowcast_config.satellite_data_directory, "past data", config["filename_format"], + bbox=bbox, ) case "s3": data = load_data_from_s3( @@ -251,6 +252,7 @@ def run_nowcast( s3_config, "past data", config["filename_format"], + bbox=bbox, ) n_loaded = len(data.time) if "time" in data.coords else 0