diff --git a/pyproject.toml b/pyproject.toml index b328128..9d81071 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "shapely>=2.0.0", "torch>=2.6.0", "zarr>=3.1.1", - "geopandas>=1.1.1", + "geopandas>=1.1.2", "rasterio>=1.4.3", ] diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 5252956..1c75ad2 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -1,7 +1,7 @@ import json from collections.abc import Iterable from pathlib import Path -from typing import TextIO +from typing import Any, TextIO import geopandas as gpd from geopandas import GeoDataFrame @@ -13,14 +13,63 @@ class GeoJSONParser: GeoJSON is a format for encoding geographic data structures using JSON. This parser supports both polygon and point geometries. + + Extended capabilities: + - Relational metadata integration: Maps properties from geometry-less definition + features to spatial annotation features via a shared join key (solve_relations). + + Expected relational schema for solve_relations: + FeatureCollection + ├── Feature (Definition) + │ ├── geometry: null + │ └── properties + │ ├── presetID: "a376..." <──────┐ (join_key) + │ └── meta: { "category": { "name": "Category", "value": "Healthy Tissue" } } + └── Feature (Annotation) │ + ├── geometry: { "type": "Polygon" } │ + └── properties │ + └── presetID: "a376..." <──────┘ """ - def __init__(self, file_path: Path | str | TextIO) -> None: + def __init__( + self, file_path: Path | str | TextIO, join_key: str | None = "presetID" + ) -> None: self.gdf = gpd.read_file(file_path) if not self.gdf.empty: - # Explode Multi-part geometries to simplify geometry handling - self.gdf = self.gdf.explode(index_parts=True) + has_geometry = ~(self.gdf.geometry.is_empty | self.gdf.geometry.isna()) + annotations = self.gdf[has_geometry].explode(index_parts=True) + definitions = self.gdf[~has_geometry] + + if join_key in self.gdf.columns and not definitions.empty: + self.gdf = self._solve_relations(annotations, definitions, join_key) # type: ignore[arg-type] + else: + self.gdf = annotations + + @staticmethod + def _solve_relations( + annotations: GeoDataFrame, definitions: GeoDataFrame, join_key: str + ) -> GeoDataFrame: + """Merge definition properties into annotations using the join key. + + Columns that exist only in the definitions are folded into the result. + Columns that exist in both get a ``_def`` suffix for the definition side. + """ + # Drop all-null columns from annotations so they don't shadow definition values + ann_null_cols = [ + c + for c in annotations.columns + if c != "geometry" and c != join_key and annotations[c].isna().all() + ] + annotations_clean = annotations.drop(columns=ann_null_cols) + + merged = annotations_clean.merge( + definitions.drop(columns=["geometry"]), + on=join_key, + how="left", + suffixes=("", "_def"), + ) + return merged def get_filtered_geodataframe( self, separator: str = "_", **kwargs: str @@ -43,17 +92,13 @@ def get_filtered_geodataframe( # If the first part of the key doesn't exist, return an empty frame return self.gdf.iloc[0:0] - series = filtered_gdf[subkeys[0]].astype(str) - if len(subkeys) > 1: - mask = series.apply(is_json_dict) - series = series[mask].apply(lambda x: json.loads(x)) - filtered_gdf = filtered_gdf[mask] - + series = filtered_gdf[subkeys[0]] for subkey in subkeys[1:]: + series = series.apply(safe_to_dict) mask = series.apply( - lambda x, subkey=subkey: isinstance(x, dict) and subkey in x + lambda x, sk=subkey: isinstance(x, dict) and sk in x ) - series = series[mask].apply(lambda x, subkey=subkey: x[subkey]) + series = series[mask].apply(lambda x, sk=subkey: x[sk]) filtered_gdf = filtered_gdf[mask] series = series.astype(str) @@ -91,11 +136,11 @@ def get_points(self, **kwargs: str) -> Iterable[Point]: yield geom -def is_json_dict(string: str) -> bool: - try: - valid_json = json.loads(string) - if isinstance(valid_json, dict): - return True - except ValueError: - return False - return False +def safe_to_dict(x: str | Any) -> Any: + """Safely converts potential JSON strings to dict, preserving existing dicts and NaNs.""" + if isinstance(x, str): + try: + return json.loads(x) + except (json.JSONDecodeError, TypeError): + return x + return x diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 03e4ce9..492863e 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -3,6 +3,7 @@ import io import json +import pandas as pd import pytest from ratiopath.parsers import ASAPParser, Darwin7JSONParser, GeoJSONParser @@ -141,6 +142,80 @@ def test_get_polygons_with_filters(self, geojson_content): polygons = list(parser.get_polygons(name="nonexistent")) assert len(polygons) == 0 + @pytest.fixture + def geojson_with_relations_content(self): + """Sample GeoJSON content with relations (definitions and annotations).""" + return { + "type": "FeatureCollection", + "features": [ + { + "type": "Feature", + "geometry": { + "type": "Polygon", + "coordinates": [ + [[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0], [0.0, 0.0]] + ], + }, + "properties": {"presetID": "a1", "shared_attr": "A"}, + }, + { + "type": "Feature", + "geometry": None, # Definition without geometry + "properties": { + "presetID": "a1", + "meta": { + "category": {"name": "Category", "value": "Healthy Tissue"} + }, + "shared_attr": "B", + }, + }, + { + "type": "Feature", + "geometry": { + "type": "Polygon", + "coordinates": [ + [[2.0, 2.0], [3.0, 2.0], [3.0, 3.0], [2.0, 3.0], [2.0, 2.0]] + ], + }, + "properties": {"presetID": "b2"}, + }, + ], + } + + def test_solve_relations_successful_merge(self, geojson_with_relations_content): + """Test resolving relations between annotations and definitions.""" + f = io.StringIO(json.dumps(geojson_with_relations_content)) + + parser = GeoJSONParser(f, join_key="presetID") + + assert len(parser.gdf) == 2 + assert parser.gdf.geometry.notna().all() + + target_row_a1 = parser.gdf[parser.gdf["presetID"] == "a1"].iloc[0] + + raw_meta = target_row_a1["meta"] + meta_dict = json.loads(raw_meta) if isinstance(raw_meta, str) else raw_meta + + assert meta_dict["category"]["value"] == "Healthy Tissue" + + assert target_row_a1["shared_attr"] == "A" + assert target_row_a1["shared_attr_def"] == "B" + + target_row_b2 = parser.gdf[parser.gdf["presetID"] == "b2"].iloc[0] + assert pd.isna(target_row_b2.get("meta")) + assert pd.isna(target_row_b2.get("shared_attr_def")) + + def test_solve_relations_missing_join_key(self, geojson_with_relations_content): + """Test solve_relations behavior when the join key is missing.""" + f = io.StringIO(json.dumps(geojson_with_relations_content)) + + parser = GeoJSONParser(f, join_key="invalid_key") + + assert len(parser.gdf) == 2 + assert parser.gdf.geometry.notna().all() + + assert not any(col.endswith("_def") for col in parser.gdf.columns) + class TestDarwin7JSONParser: """Test the Darwin JSON parser.""" diff --git a/uv.lock b/uv.lock index 036cf54..8c0ed8a 100644 --- a/uv.lock +++ b/uv.lock @@ -291,7 +291,7 @@ wheels = [ [[package]] name = "geopandas" -version = "1.1.1" +version = "1.1.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy" }, @@ -301,9 +301,9 @@ dependencies = [ { name = "pyproj" }, { name = "shapely" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8c/76/e1960ba846f153ab109575242abf89dc98f8e057faa32f3decf4cce9247a/geopandas-1.1.1.tar.gz", hash = "sha256:1745713f64d095c43e72e08e753dbd271678254b24f2e01db8cdb8debe1d293d", size = 332655, upload-time = "2025-06-26T21:04:56.57Z" } +sdist = { url = "https://files.pythonhosted.org/packages/8d/24/5eb5685d7bf89d64218919379f882d19a60f8219d66d833c83b1cf264c95/geopandas-1.1.2.tar.gz", hash = "sha256:33f7b33565c46a45b8459a2ab699ec943fdbb5716e58e251b3c413cf7783106c", size = 336037, upload-time = "2025-12-22T21:06:13.749Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl", hash = "sha256:589e61aaf39b19828843df16cb90234e72897e2579be236f10eee0d052ad98e8", size = 338365, upload-time = "2025-06-26T21:04:55.139Z" }, + { url = "https://files.pythonhosted.org/packages/54/e4/fac19dc34cb686c96011388b813ff7b858a70681e5ce6ce7698e5021b0f4/geopandas-1.1.2-py3-none-any.whl", hash = "sha256:2bb0b1052cb47378addb4ba54c47f8d4642dcbda9b61375638274f49d9f0bb0d", size = 341734, upload-time = "2025-12-22T21:06:12.498Z" }, ] [[package]] @@ -1509,7 +1509,7 @@ tests = [ [package.metadata] requires-dist = [ { name = "albumentations", specifier = ">=2.0.8" }, - { name = "geopandas", specifier = ">=1.1.1" }, + { name = "geopandas", specifier = ">=1.1.2" }, { name = "imagecodecs", specifier = ">=2025.8.2" }, { name = "numpy", specifier = ">=2.2.2" }, { name = "openslide-python", specifier = ">=1.4.1" },