diff --git a/README.md b/README.md index da9dc8f..61cfd60 100644 --- a/README.md +++ b/README.md @@ -134,8 +134,10 @@ podman run -it --rm --entrypoint="" sunflow bash ### Arguments - `--dataset` - Choose between KNMI or DWD data sources -- `--bbox` - Predefined bounding boxes (DENMARK, NW_EUROPE, CUSTOM) -- `--custom-bbox` - Custom bounding box (lon_min,lat_min,lon_max,lat_max) +- `--domain_satellite` - Domain for required satellite input coverage (DENMARK, NW_EUROPE, NW_EUROPE_SATELLITE, CUSTOM) +- `--custom_domain_satellite` - Custom `domain_satellite` (lon_min,lat_min,lon_max,lat_max) +- `--domain_nowcast` - Domain written to output (DENMARK, NW_EUROPE, NW_EUROPE_SATELLITE, CUSTOM, defaults to `--domain_satellite`) +- `--custom_domain_nowcast` - Custom `domain_nowcast` (lon_min,lat_min,lon_max,lat_max) - `--time` - Specific time for processing in ISO8601 format -- `--start-time` - Start of a time range in ISO8601 format (use with `--end-time`) -- `--end-time` - End of a time range in ISO8601 format, inclusive (use with `--start-time`) +- `--start_time` - Start of a time range in ISO8601 format (use with `--end_time`) +- `--end_time` - End of a time range in ISO8601 format, inclusive (use with `--start_time`) diff --git a/config.yaml b/config.yaml index da1625d..a88b27f 100644 --- a/config.yaml +++ b/config.yaml @@ -10,12 +10,12 @@ KNMI: sds: sds sds_cs: sds_cs # Required: Filename format for input data files - # filename_format: "{dataset_name}_{timestamp}_{bbox_choice}.nc" + # filename_format: "{dataset_name}_{timestamp}_{domain_satellite_choice}.nc" # Format for PDS data: filename_format: "NetCDF4_sds_{pds_timestamp}.nc" # Available template variables: # {dataset_name}: Dataset name (e.g., KNMI) - # {bbox_choice}: Bounding box identifier (e.g., NW_EUROPE) + # {domain_satellite_choice}: Satellite domain identifier (e.g., NW_EUROPE_SATELLITE) # {timestamp}: Compact format YYYYMMDDHHMM # {pds_timestamp}: PDS format YYYY-MM-DDTHH_MM_SSZ # {year}: Four-digit year (e.g., 2026) @@ -33,5 +33,5 @@ DWD: sds: SIS sds_cs: SISc # Required: Filename format for input data files - # Supports subdirectories via path separators, e.g.: 
"{year}/{month}/{day}/{dataset_name}_{timestamp}_{bbox_choice}.nc" - filename_format: "{dataset_name}_{timestamp}_{bbox_choice}.nc" + # Supports subdirectories via path separators, e.g.: "{year}/{month}/{day}/{dataset_name}_{timestamp}_{domain_satellite_choice}.nc" + filename_format: "{dataset_name}_{timestamp}_{domain_satellite_choice}.nc" diff --git a/sunflow/config.py b/sunflow/config.py index 6cb2e7f..0d53d9a 100644 --- a/sunflow/config.py +++ b/sunflow/config.py @@ -3,10 +3,12 @@ from dataclasses import dataclass from typing import Self -# Predefined Bounding Box (BBOX) options -BBOX_OPTIONS: dict[str, str | None] = { +# Predefined domain options +# Format: lon_min,lat_min,lon_max,lat_max +DOMAIN_OPTIONS: dict[str, str | None] = { "DENMARK": "4,50,18,62", "NW_EUROPE": "-10.75,47.25,20,63.5", + "NW_EUROPE_SATELLITE": "-20.75,37.25,30,73.5", "CUSTOM": None, } diff --git a/sunflow/data_io.py b/sunflow/data_io.py index 70973d3..61a838e 100644 --- a/sunflow/data_io.py +++ b/sunflow/data_io.py @@ -11,7 +11,7 @@ from loguru import logger from .config import NowcastConfig, S3Config -from .geospatial import subset_to_bbox +from .geospatial import subset_to_bbox, validate_dataset_covers_domain from .validation import DataNotAvailableError @@ -19,9 +19,9 @@ def fetch_current_data_with_retry( time_step: datetime, run_mode: str, config: dict[str, Any], - bbox: str, + domain_satellite: str, dataset_name: str, - bbox_choice: str, + domain_satellite_choice: str, nowcast_config: NowcastConfig, s3_config: S3Config, custom_time: bool = False, @@ -32,9 +32,9 @@ def fetch_current_data_with_retry( time_step: Datetime object for data to fetch run_mode: One of 'download', 'files', or 's3' config: Dataset configuration dict - bbox: Bounding box string + domain_satellite: Domain string for required satellite input coverage dataset_name: Name of dataset (options: KNMI, DWD) - bbox_choice: Bounding box choice string + domain_satellite_choice: Domain choice used in input filenames 
nowcast_config: NowcastConfig object s3_config: S3Config object custom_time: Whether a custom time was specified (no retry if True) @@ -57,26 +57,28 @@ def fetch_current_data_with_retry( download_current_data( time_step, config, - bbox, + domain_satellite, dataset_name, - bbox_choice, + domain_satellite_choice, nowcast_config.satellite_data_directory, ) case "files": check_current_data_existence_file( time_step, dataset_name, - bbox_choice, + domain_satellite_choice, nowcast_config.satellite_data_directory, config["filename_format"], + domain_satellite, ) case "s3": check_current_data_existence_s3( time_step, dataset_name, - bbox_choice, + domain_satellite_choice, s3_config, config["filename_format"], + domain_satellite, ) logger.info(f"Data successfully retrieved for {time_step_str}") @@ -108,7 +110,7 @@ def fetch_current_data_with_retry( def generate_input_filename( time_step: datetime, dataset_name: str, - bbox_choice: str, + domain_satellite_choice: str, filename_format: str, ) -> str: """Generate input filename based on a format template string. @@ -118,7 +120,7 @@ def generate_input_filename( Args: time_step: Datetime of the data timestep. dataset_name: Name of dataset (options: KNMI, DWD). - bbox_choice: Bounding box identifier. + domain_satellite_choice: Satellite domain identifier. filename_format: Template string from config['filename_format']. Returns: @@ -126,7 +128,7 @@ def generate_input_filename( Template variables supported: {dataset_name}: Name of the dataset - {bbox_choice}: Bounding box identifier + {domain_satellite_choice}: Satellite domain identifier {timestamp}: Compact format YYYYMMDDHHMM {pds_timestamp}: PDS format YYYY-MM-DDTHH_MM_SSZ {year}: Four-digit year (e.g. 2026) @@ -138,16 +140,14 @@ def generate_input_filename( ``{year}/{month}/{day}/{dataset_name}_{timestamp}.nc`` resolves to a file inside a date-structured subdirectory of *satellite_data_directory*. 
""" - format_template = filename_format - # Generate both time formats timestamp_compact = time_step.strftime("%Y%m%d%H%M") timestamp_pds = time_step.strftime("%Y-%m-%dT%H_%M_%SZ") # Substitute template variables - filename = format_template.format( + filename = filename_format.format( dataset_name=dataset_name, - bbox_choice=bbox_choice, + domain_satellite_choice=domain_satellite_choice, timestamp=timestamp_compact, pds_timestamp=timestamp_pds, year=time_step.strftime("%Y"), @@ -161,35 +161,46 @@ def generate_input_filename( def check_current_data_existence_file( request_time: datetime, dataset_name: str, - bbox_choice: str, + domain_satellite_choice: str, satellite_data_directory: str, filename_format: str, + required_domain: str | None = None, ) -> None: """Check for existence of current data file. Args: request_time: Python datetime object. dataset_name: Name of dataset (options: KNMI, DWD). - bbox_choice: Bounding box identifier. + domain_satellite_choice: Satellite domain identifier. satellite_data_directory: Directory containing data files. filename_format: Template string from config['filename_format']. + required_domain: Optional domain string that must be covered by the file. Raises: FileNotFoundError: If the expected file does not exist. + RuntimeError: If required_domain is provided and file coverage is insufficient. 
""" filename = generate_input_filename( - request_time, dataset_name, bbox_choice, filename_format + request_time, dataset_name, domain_satellite_choice, filename_format ) filepath = os.path.join(satellite_data_directory, filename) logger.info(f"Checking existence of data at {filepath}") if not os.path.exists(filepath): raise FileNotFoundError(f"Input file not found: {filepath}") + if required_domain: + with xr.open_dataset(filepath) as ds: + validate_dataset_covers_domain( + ds, + required_domain, + f"Input file {filepath}", + ) + def load_data_from_files( time_steps: list[datetime], dataset_name: str, - bbox_choice: str, + domain_satellite_choice: str, satellite_data_directory: str, data_type: str, filename_format: str, @@ -200,7 +211,7 @@ def load_data_from_files( Args: time_steps: List of timesteps to load. dataset_name: Name of dataset (options: KNMI, DWD). - bbox_choice: Bounding box identifier. + domain_satellite_choice: Satellite domain identifier. satellite_data_directory: Directory containing data files. data_type: Type of data for logging (options: past data, clearsky data). filename_format: Template string from config['filename_format']. @@ -215,7 +226,7 @@ def load_data_from_files( for time_step in time_steps: filename = generate_input_filename( - time_step, dataset_name, bbox_choice, filename_format + time_step, dataset_name, domain_satellite_choice, filename_format ) filepath = os.path.join(satellite_data_directory, filename) @@ -241,24 +252,27 @@ def load_data_from_files( def check_current_data_existence_s3( request_time: datetime, dataset_name: str, - bbox_choice: str, + domain_satellite_choice: str, s3_config: S3Config, filename_format: str, + required_domain: str | None = None, ) -> None: """Check for existence of current data file in S3. Args: request_time: Python datetime object. dataset_name: Name of dataset (options: KNMI, DWD). - bbox_choice: Bounding box identifier. + domain_satellite_choice: Satellite domain identifier. 
s3_config: S3 configuration object. filename_format: Template string from config['filename_format']. + required_domain: Optional domain string that must be covered by the file. Raises: FileNotFoundError: If the expected file does not exist in S3. + RuntimeError: If required_domain is provided and file coverage is insufficient. """ filename = generate_input_filename( - request_time, dataset_name, bbox_choice, filename_format + request_time, dataset_name, domain_satellite_choice, filename_format ) s3_path = f"s3://{s3_config.bucket}/{s3_config.input_prefix}/{filename}" logger.info(f"Checking existence of data at {s3_path}") @@ -270,6 +284,16 @@ ) if not fs.exists(s3_path): raise FileNotFoundError(f"Input file not yet found in S3: {s3_path}") + + if required_domain: + with fs.open(s3_path, "rb") as f: + with xr.open_dataset(f, engine="h5netcdf") as ds: + ds_loaded = ds.load() + validate_dataset_covers_domain( + ds_loaded, + required_domain, + f"Input file {s3_path}", + ) except FileNotFoundError: raise except Exception as e: @@ -280,7 +304,7 @@ def load_data_from_s3( time_steps: list[datetime], dataset_name: str, - bbox_choice: str, + domain_satellite_choice: str, s3_config: S3Config, data_type: str, filename_format: str, 
@@ -310,7 +334,7 @@ def load_data_from_s3( for time_step in time_steps: filename = generate_input_filename( - time_step, dataset_name, bbox_choice, filename_format + time_step, dataset_name, domain_satellite_choice, filename_format ) s3_path = f"s3://{s3_config.bucket}/{s3_config.input_prefix}/{filename}" @@ -340,7 +364,7 @@ def fetch_clearsky_with_fallback( config: dict[str, Any], bbox: str, dataset_name: str, - bbox_choice: str, + domain_satellite_choice: str, nowcast_config: NowcastConfig, s3_config: S3Config, ) -> xr.Dataset: @@ -358,7 +382,7 @@ config: Dataset configuration dict. bbox: Bounding box string. dataset_name: Name of dataset (options: KNMI, DWD). - bbox_choice: Bounding box identifier. + domain_satellite_choice: Satellite domain identifier. nowcast_config: NowcastConfig object. s3_config: S3Config object. @@ -393,7 +417,7 @@ fetched = load_data_from_files( [source_time], dataset_name, - bbox_choice, + domain_satellite_choice, nowcast_config.satellite_data_directory, "clearsky data", config["filename_format"], @@ -403,7 +427,7 @@ fetched = load_data_from_s3( [source_time], dataset_name, - bbox_choice, + domain_satellite_choice, s3_config, "clearsky data", config["filename_format"], diff --git a/sunflow/downloaders.py b/sunflow/downloaders.py index 0867160..0f150f1 100644 --- a/sunflow/downloaders.py +++ b/sunflow/downloaders.py @@ -72,7 +72,7 @@ def download_current_data( config: dict[str, Any], bbox: str, dataset_name: str, - bbox_choice: str, + domain_satellite_choice: str, satellite_data_directory: str, ) -> None: """Download satellite data for a single timestep and save it to disk. @@ -92,7 +92,7 @@ base_url, variables, format, crs, and filename_format. bbox: Bounding box string lon_min,lat_min,lon_max,lat_max. dataset_name: Name of the dataset source (options: KNMI, DWD). 
- bbox_choice: Bounding box identifier used in the output filename. + domain_satellite_choice: Satellite domain identifier used in the output filename. satellite_data_directory: Directory where the NetCDF file is saved. """ if dataset_name == "KNMI": @@ -120,7 +120,10 @@ def download_current_data( # Save data current_time_dt = current_time.astype("datetime64[s]").astype(datetime) filename = generate_input_filename( - current_time_dt, dataset_name, bbox_choice, config["filename_format"] + current_time_dt, + dataset_name, + domain_satellite_choice, + config["filename_format"], ) output_path = os.path.join(satellite_data_directory, filename) merged_ds.to_netcdf(output_path) diff --git a/sunflow/geospatial.py b/sunflow/geospatial.py index 80712ff..7418aaf 100644 --- a/sunflow/geospatial.py +++ b/sunflow/geospatial.py @@ -6,7 +6,10 @@ import xarray as xr from loguru import logger -from .config import BBOX_OPTIONS +from .config import DOMAIN_OPTIONS + +# Tiny absolute tolerance for floating-point boundary comparisons in degrees. +COVERAGE_ABS_TOL_DEGREES = 1e-9 def subset_to_bbox(ds: xr.Dataset, bbox: str) -> xr.Dataset: @@ -43,23 +46,64 @@ def subset_to_bbox(ds: xr.Dataset, bbox: str) -> xr.Dataset: ) -def get_bbox(bbox_choice: str, custom_bbox: str | None = None) -> str | None: - """Return the bounding box string for a given bbox choice. +def resolve_domain_bbox( + domain_choice: str, + custom_domain: str | None = None, +) -> str | None: + """Return the bbox string for a given domain choice. - For predefined choices ('DENMARK', 'NW_EUROPE'), looks up - the value in BBOX_OPTIONS. For 'CUSTOM', returns custom_bbox - directly. + For predefined choices ('DENMARK', 'NW_EUROPE'), looks up the value in + DOMAIN_OPTIONS. For 'CUSTOM', returns custom_domain directly. Args: - bbox_choice: One of 'DENMARK', 'NW_EUROPE', or 'CUSTOM'. - custom_bbox: Bbox string lon_min,lat_min,lon_max,lat_max used - when bbox_choice='CUSTOM'. Default None. 
+ domain_choice: One of 'DENMARK', 'NW_EUROPE', or 'CUSTOM'. + custom_domain: Bbox string lon_min,lat_min,lon_max,lat_max used + when domain_choice='CUSTOM'. Returns: Bounding box string, or None if the predefined choice has no associated value. """ - return custom_bbox if bbox_choice == "CUSTOM" else BBOX_OPTIONS[bbox_choice] + return custom_domain if domain_choice == "CUSTOM" else DOMAIN_OPTIONS[domain_choice] + + +def parse_bbox(bbox: str) -> tuple[float, float, float, float]: + """Parse and validate a bbox string. + + Args: + bbox: Bounding box string in format lon_min,lat_min,lon_max,lat_max. + + Returns: + Tuple (lon_min, lat_min, lon_max, lat_max). + + Raises: + ValueError: If the format is invalid or bounds are not ordered. + """ + parts = bbox.split(",") + if len(parts) != 4: + raise ValueError("Must have exactly 4 comma-separated values") + + lon_min, lat_min, lon_max, lat_max = [float(x) for x in parts] + + if lon_min >= lon_max: + raise ValueError("lon_min must be smaller than lon_max") + if lat_min >= lat_max: + raise ValueError("lat_min must be smaller than lat_max") + + return lon_min, lat_min, lon_max, lat_max + + +def domain_contains(outer_bbox: str, inner_bbox: str) -> bool: + """Return True if outer_bbox fully contains inner_bbox.""" + outer_lon_min, outer_lat_min, outer_lon_max, outer_lat_max = parse_bbox(outer_bbox) + inner_lon_min, inner_lat_min, inner_lon_max, inner_lat_max = parse_bbox(inner_bbox) + + return ( + outer_lon_min <= inner_lon_min + and outer_lat_min <= inner_lat_min + and outer_lon_max >= inner_lon_max + and outer_lat_max >= inner_lat_max + ) def get_coordinates(ds: xr.Dataset) -> tuple[np.ndarray, np.ndarray]: @@ -95,6 +139,111 @@ def get_coordinates(ds: xr.Dataset) -> tuple[np.ndarray, np.ndarray]: return latitudes, longitudes +def infer_coordinate_edges(coords: np.ndarray) -> tuple[float, float]: + """Infer min/max coordinate edges from coordinate center values. 
+ + For regularly gridded coordinates represented by cell centers, this + returns the outer cell edges by extending half a grid step beyond the + min/max center values. + + Args: + coords: 1-D coordinate center values. + + Returns: + Tuple (edge_min, edge_max). + + Raises: + RuntimeError: If coordinates are empty. + """ + if len(coords) == 0: + raise RuntimeError("Coordinate array cannot be empty.") + + coord_min = float(np.min(coords)) + coord_max = float(np.max(coords)) + + # A single coordinate has no resolvable spacing, so edge == center. + if len(coords) < 2: + return coord_min, coord_max + + # Use median absolute spacing to remain robust to ascending/descending order. + spacing = float(np.median(np.abs(np.diff(coords)))) + half_step = 0.5 * spacing + return coord_min - half_step, coord_max + half_step + + +def validate_dataset_covers_domain( + ds: xr.Dataset, + domain_bbox: str, + context: str, +) -> None: + """Validate that dataset coordinates fully cover a requested domain. + + Args: + ds: Input dataset containing latitude and longitude coordinates. + domain_bbox: Requested bbox string lon_min,lat_min,lon_max,lat_max. + context: Human-readable context for error messages. + + Raises: + RuntimeError: If dataset bounds do not fully contain requested domain. + """ + latitudes, longitudes = get_coordinates(ds) + lon_min, lat_min, lon_max, lat_max = parse_bbox(domain_bbox) + + data_lon_min, data_lon_max = infer_coordinate_edges(longitudes) + data_lat_min, data_lat_max = infer_coordinate_edges(latitudes) + + if ( + data_lon_min > lon_min + COVERAGE_ABS_TOL_DEGREES + or data_lat_min > lat_min + COVERAGE_ABS_TOL_DEGREES + or data_lon_max < lon_max - COVERAGE_ABS_TOL_DEGREES + or data_lat_max < lat_max - COVERAGE_ABS_TOL_DEGREES + ): + raise RuntimeError( + f"{context} does not cover requested domain_satellite={domain_bbox}. " + f"Available bounds are " + f"{data_lon_min},{data_lat_min},{data_lon_max},{data_lat_max}." 
+ ) + + +def crop_forecast_to_domain( + forecast: np.ndarray, + latitudes: np.ndarray, + longitudes: np.ndarray, + domain_bbox: str, +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """Crop a [time, lat, lon] forecast and its coordinates to a domain bbox. + + Args: + forecast: Forecast array with shape [time, lat, lon]. + latitudes: 1-D latitude array. + longitudes: 1-D longitude array. + domain_bbox: Requested bbox string lon_min,lat_min,lon_max,lat_max. + + Returns: + Tuple (cropped_forecast, cropped_latitudes, cropped_longitudes). + + Raises: + RuntimeError: If forecast dimensionality is not [time, lat, lon] or the + requested domain has no overlap with the provided coordinates. + """ + if forecast.ndim != 3: + raise RuntimeError( + f"Expected forecast shape [time, lat, lon], got {forecast.shape}." + ) + + lon_min, lat_min, lon_max, lat_max = parse_bbox(domain_bbox) + lat_idx = np.where((latitudes >= lat_min) & (latitudes <= lat_max))[0] + lon_idx = np.where((longitudes >= lon_min) & (longitudes <= lon_max))[0] + + if len(lat_idx) == 0 or len(lon_idx) == 0: + raise RuntimeError( + f"Requested domain_nowcast={domain_bbox} does not overlap forecast grid." + ) + + cropped_forecast = forecast[:, lat_idx, :][:, :, lon_idx] + return cropped_forecast, latitudes[lat_idx], longitudes[lon_idx] + + def check_solar_elevation( time: datetime, lat: float = 55.6761, diff --git a/sunflow/main.py b/sunflow/main.py index 6034bab..2d881ed 100644 --- a/sunflow/main.py +++ b/sunflow/main.py @@ -13,7 +13,7 @@ from pysteps.motion.lucaskanade import dense_lucaskanade from . 
import __version__ -from .config import NowcastConfig, S3Config +from .config import DOMAIN_OPTIONS, NowcastConfig, S3Config from .data_io import ( fetch_clearsky_with_fallback, fetch_current_data_with_retry, @@ -23,7 +23,14 @@ ) from .downloaders import download_past_data from .forecast import multiply_clearsky, preprocess_data, simple_advection_forecast -from .geospatial import check_solar_elevation, get_bbox +from .geospatial import ( + check_solar_elevation, + crop_forecast_to_domain, + domain_contains, + parse_bbox, + resolve_domain_bbox, + validate_dataset_covers_domain, +) from .time_handler import generate_time_steps, round_time from .validation import ( MissingClearskyDataError, @@ -52,18 +59,18 @@ class RunResult: # Model version model_version = __version__ +DOMAIN_CHOICES = tuple(DOMAIN_OPTIONS) def parse_arguments() -> argparse.Namespace: """Parse and validate command line arguments. - Defines arguments for run mode, dataset, bounding box, and an optional - custom time override. Validates that --custom-bbox is provided and - correctly formatted when --bbox=CUSTOM is selected. + Defines arguments for run mode, dataset, spatial domains, and an optional + custom time override. Returns: Parsed argument namespace with attributes run_mode, dataset, - bbox, custom_bbox, and time. + domain_satellite, domain_nowcast, and time. 
""" def parse_datetime_with_timezone(datetime_str: str) -> datetime: @@ -104,15 +111,30 @@ def parse_datetime_with_timezone(datetime_str: str) -> datetime: "(default: KNMI)", ) parser.add_argument( - "--bbox", - choices=["DENMARK", "NW_EUROPE", "CUSTOM"], + "--domain_satellite", + choices=DOMAIN_CHOICES, default="NW_EUROPE", - help="Choose bounding box (default: NW_EUROPE)", + help="Domain required for satellite input coverage (default: NW_EUROPE)", + ) + parser.add_argument( + "--custom_domain_satellite", + type=str, + help='Custom domain_satellite in format "lon_min,lat_min,lon_max,lat_max"', + default=None, + ) + parser.add_argument( + "--domain_nowcast", + choices=DOMAIN_CHOICES, + default=None, + help=( + "Domain written to forecast output. Defaults to domain_satellite " + "when omitted" + ), ) parser.add_argument( - "--custom-bbox", + "--custom_domain_nowcast", type=str, - help='Custom bbox in format "lon_min,lat_min,lon_max,lat_max"', + help='Custom domain_nowcast in format "lon_min,lat_min,lon_max,lat_max"', default=None, ) parser.add_argument( @@ -122,15 +144,15 @@ def parse_datetime_with_timezone(datetime_str: str) -> datetime: default=None, ) parser.add_argument( - "--start-time", + "--start_time", type=parse_datetime_with_timezone, - help="Start of time span in ISO8601 format. Use with --end-time.", + help="Start of time span in ISO8601 format. Use with --end_time.", default=None, ) parser.add_argument( - "--end-time", + "--end_time", type=parse_datetime_with_timezone, - help="End of time span in ISO8601 format (inclusive). Use with --start-time.", + help="End of time span in ISO8601 format (inclusive). 
Use with --start_time.", default=None, ) @@ -138,27 +160,47 @@ def parse_datetime_with_timezone(datetime_str: str) -> datetime: # Validate time arguments if args.time and (args.start_time or args.end_time): - parser.error("--time cannot be combined with --start-time/--end-time") + parser.error("--time cannot be combined with --start_time/--end_time") if bool(args.start_time) != bool(args.end_time): - parser.error("--start-time and --end-time must be provided together") + parser.error("--start_time and --end_time must be provided together") if args.start_time and args.end_time and args.start_time > args.end_time: - parser.error("--start-time must be before --end-time") + parser.error("--start_time must be before --end_time") + + def validate_custom_domain( + domain_choice: str | None, + custom_domain: str | None, + domain_arg: str, + custom_arg: str, + ) -> None: + if domain_choice == "CUSTOM": + if not custom_domain: + parser.error(f"{custom_arg} is required when {domain_arg}=CUSTOM") + try: + parse_bbox(custom_domain) + except ValueError as e: + parser.error( + f"Invalid {custom_arg} format: {e}. " + "Use format 'lon_min,lat_min,lon_max,lat_max'" + ) + elif custom_domain: + parser.error(f"{custom_arg} is only valid when {domain_arg}=CUSTOM") + + validate_custom_domain( + args.domain_satellite, + args.custom_domain_satellite, + "--domain_satellite", + "--custom_domain_satellite", + ) - # Validate custom bbox - if args.bbox == "CUSTOM": - if not args.custom_bbox: - parser.error("--custom-bbox is required when --bbox=CUSTOM") - try: - # Validate format by trying to parse - bbox_parts = args.custom_bbox.split(",") - if len(bbox_parts) != 4: - raise ValueError("Must have exactly 4 comma-separated values") - [float(x) for x in bbox_parts] # Ensure all are numeric - except ValueError as e: - parser.error( - f"Invalid --custom-bbox format: {e}. 
" - "Use format 'lon_min,lat_min,lon_max,lat_max'" - ) + if args.domain_nowcast is None and args.custom_domain_nowcast: + parser.error("--custom_domain_nowcast requires --domain_nowcast=CUSTOM") + + validate_custom_domain( + args.domain_nowcast, + args.custom_domain_nowcast, + "--domain_nowcast", + "--custom_domain_nowcast", + ) return args @@ -167,9 +209,10 @@ def run_nowcast( time_step: datetime, run_mode: str, config: dict, - bbox: str, + domain_satellite: str, + domain_nowcast: str, dataset_name: str, - bbox_choice: str, + domain_satellite_choice: str, nowcast_config: NowcastConfig, s3_config: S3Config, custom_time: bool = True, @@ -180,9 +223,10 @@ def run_nowcast( time_step: The time step to produce a forecast for. run_mode: One of 'download', 'files', or 's3'. config: Dataset configuration dict. - bbox: Bounding box string. + domain_satellite: Domain string used for satellite input coverage. + domain_nowcast: Domain string used for output cropping. dataset_name: Name of dataset. - bbox_choice: Bounding box identifier. + domain_satellite_choice: Domain identifier used for input filenames. nowcast_config: NowcastConfig object. s3_config: S3Config object. custom_time: If True, skip the retry wait loop on missing data. 
@@ -199,9 +243,9 @@ def run_nowcast( time_step, run_mode, config, - bbox, + domain_satellite, dataset_name, - bbox_choice, + domain_satellite_choice, nowcast_config, s3_config, custom_time=custom_time, @@ -234,32 +278,39 @@ def run_nowcast( logger.info(f"Loading past data for {len(past_time_steps)} time steps...") match run_mode: case "download": - data = download_past_data(past_time_steps, config, bbox, dataset_name) + data = download_past_data( + past_time_steps, + config, + domain_satellite, + dataset_name, + ) case "files": data = load_data_from_files( past_time_steps, dataset_name, - bbox_choice, + domain_satellite_choice, nowcast_config.satellite_data_directory, "past data", config["filename_format"], - bbox=bbox, + bbox=domain_satellite, ) case "s3": data = load_data_from_s3( past_time_steps, dataset_name, - bbox_choice, + domain_satellite_choice, s3_config, "past data", config["filename_format"], - bbox=bbox, + bbox=domain_satellite, ) n_loaded = len(data.time) if "time" in data.coords else 0 logger.info(f"Loaded {n_loaded} past data timesteps") if n_loaded == 0: raise RuntimeError("No past data loaded. 
Cannot proceed.") + if run_mode in {"files", "s3"}: + validate_dataset_covers_domain(data, domain_satellite, "Input dataset") # Preprocess logger.info("Preprocessing data...") @@ -301,13 +352,20 @@ def run_nowcast( run_mode, nowcast_config.max_clearsky_fallback_days, config, - bbox, + domain_satellite, dataset_name, - bbox_choice, + domain_satellite_choice, nowcast_config, s3_config, ) + if run_mode in {"files", "s3"} and clearsky_data.sizes.get("time", 0) > 0: + validate_dataset_covers_domain( + clearsky_data, + domain_satellite, + "Clearsky dataset", + ) + # Validate clearsky data try: validate_clearsky_completeness(clearsky_data, previous_day_time_steps) @@ -338,6 +396,13 @@ def run_nowcast( solar_t0 = ratio_data[-1] * sds_cs_t0 solar_forecast = np.concatenate([solar_t0[np.newaxis, :, :], solar_forecast], axis=0) + solar_forecast, latitudes, longitudes = crop_forecast_to_domain( + solar_forecast, + latitudes, + longitudes, + domain_nowcast, + ) + # Save forecast (now contains actual solar irradiance, not ratios) filename = save_forecast( solar_forecast, @@ -399,8 +464,38 @@ def cli() -> None: run_mode = args.run_mode dataset_name = args.dataset - bbox_choice = args.bbox - bbox = get_bbox(bbox_choice, args.custom_bbox) + domain_satellite_choice = args.domain_satellite + domain_satellite = resolve_domain_bbox( + domain_satellite_choice, + args.custom_domain_satellite, + ) + if domain_satellite is None: + raise RuntimeError( + "domain_satellite could not be resolved. " + "Use a predefined choice or provide --custom_domain_satellite." + ) + + domain_nowcast_choice = args.domain_nowcast + if domain_nowcast_choice is None: + domain_nowcast_choice = domain_satellite_choice + domain_nowcast = domain_satellite + else: + domain_nowcast = resolve_domain_bbox( + domain_nowcast_choice, + args.custom_domain_nowcast, + ) + if domain_nowcast is None: + raise RuntimeError( + "domain_nowcast could not be resolved. " + "Use a predefined choice or provide --custom_domain_nowcast." 
+ ) + + if not domain_contains(domain_satellite, domain_nowcast): + raise RuntimeError( + "domain_nowcast must be fully contained within domain_satellite. " + f"Got domain_satellite={domain_satellite}, " + f"domain_nowcast={domain_nowcast}." + ) config = yaml.safe_load(open("config.yaml"))[dataset_name] @@ -409,7 +504,8 @@ def cli() -> None: logger.info(f"Running in {run_mode} mode") logger.info(f"Using {dataset_name} dataset") - logger.info(f"Using {bbox_choice} bbox: {bbox}") + logger.info(f"Using satellite domain {domain_satellite_choice}: {domain_satellite}") + logger.info(f"Using nowcast domain {domain_nowcast_choice}: {domain_nowcast}") validate_run_mode(run_mode, dataset_name) validate_config(config, dataset_name) @@ -459,9 +555,10 @@ def cli() -> None: time_step, run_mode, config, - bbox, + domain_satellite, + domain_nowcast, dataset_name, - bbox_choice, + domain_satellite_choice, nowcast_config, s3_config, custom_time=custom_time,