Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
3d67ec5
feature: implement parsing of files with polygons and definitions sep…
vojtech-cifka Feb 21, 2026
b29ba81
docs: add relational schema diagram to GeoJSONParser docstring
vojtech-cifka Feb 22, 2026
e4cb761
chore: improve readability
vojtech-cifka Feb 23, 2026
7170bd7
fix: format
vojtech-cifka Feb 23, 2026
3269c63
fix: redundant dataframe creation, added documentation
vojtech-cifka Feb 23, 2026
1eb4c6b
fix: format
vojtech-cifka Feb 23, 2026
8393613
feat: add relational geojson test
vojtech-cifka Feb 24, 2026
fb160f1
fix: format
vojtech-cifka Feb 24, 2026
d091b79
fix: change the filtering logic, improve safety of solving relations
vojtech-cifka Feb 24, 2026
d70a1a8
fix: use dict check
vojtech-cifka Feb 24, 2026
20cf512
fix: give crs for further processing
vojtech-cifka Feb 24, 2026
0e15845
fix: change the example path to tissue type
vojtech-cifka Feb 26, 2026
b5bd870
feat: add validation for annotation without definition
vojtech-cifka Feb 26, 2026
def9bca
chore: remove redundant variable
vojtech-cifka Feb 26, 2026
6c8bd29
fix: format
vojtech-cifka Feb 26, 2026
8739574
fix: handle dynamic json parsing in geojson solve_relations
vojtech-cifka Feb 26, 2026
c25ed4d
refactor: dynamically assert the absence of any columns ending with _…
vojtech-cifka Feb 26, 2026
81aa4ac
fix: update geopandas version
vojtech-cifka Mar 2, 2026
86a83e2
fix: rewrite tests for the new private function
vojtech-cifka Mar 2, 2026
9bb0a24
refactor: change the solve relations logic for merging definitions an…
vojtech-cifka Mar 2, 2026
4e450d1
refactor: change init, make solve_relations private and clean up obsole…
vojtech-cifka Mar 2, 2026
18d34e4
fix: resubmit uv.lock file
vojtech-cifka Mar 2, 2026
9e85d77
fix: revert last changes
vojtech-cifka Mar 2, 2026
135fa88
Merge branch 'main' into feature/geojson-relational-parsing
matejpekar Mar 3, 2026
eeea45f
Update tests/test_parsers.py
matejpekar Mar 3, 2026
9a23263
refactor: use a cleaner semantically equivalent code
vojtech-cifka Mar 3, 2026
8622bd6
fix: add explicit type guard for join_key to satisfy mypy
vojtech-cifka Mar 3, 2026
0baa970
fix: disable mypy
vojtech-cifka Mar 3, 2026
ce1aa57
fix: format
vojtech-cifka Mar 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ dependencies = [
"shapely>=2.0.0",
"torch>=2.6.0",
"zarr>=3.1.1",
"geopandas>=1.1.1",
"geopandas>=1.1.2",
"rasterio>=1.4.3",
]

Expand Down
85 changes: 65 additions & 20 deletions ratiopath/parsers/geojson_parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
from collections.abc import Iterable
from pathlib import Path
from typing import TextIO
from typing import Any, TextIO

import geopandas as gpd
from geopandas import GeoDataFrame
Expand All @@ -13,14 +13,63 @@ class GeoJSONParser:

GeoJSON is a format for encoding geographic data structures using JSON.
This parser supports both polygon and point geometries.

Extended capabilities:
- Relational metadata integration: Maps properties from geometry-less definition
features to spatial annotation features via a shared join key (the ``join_key``
constructor argument; the merge itself is done by ``_solve_relations``).

Expected relational schema for the relational merge:
FeatureCollection
├── Feature (Definition)
│ ├── geometry: null
│ └── properties
│ ├── presetID: "a376..." <──────┐ (join_key)
│ └── meta: { "category": { "name": "Category", "value": "Healthy Tissue" } }
└── Feature (Annotation) │
├── geometry: { "type": "Polygon" } │
└── properties │
└── presetID: "a376..." <──────┘
"""

def __init__(self, file_path: Path | str | TextIO) -> None:
def __init__(
self, file_path: Path | str | TextIO, join_key: str | None = "presetID"
) -> None:
self.gdf = gpd.read_file(file_path)

if not self.gdf.empty:
# Explode Multi-part geometries to simplify geometry handling
self.gdf = self.gdf.explode(index_parts=True)
has_geometry = ~(self.gdf.geometry.is_empty | self.gdf.geometry.isna())
annotations = self.gdf[has_geometry].explode(index_parts=True)
definitions = self.gdf[~has_geometry]

if join_key in self.gdf.columns and not definitions.empty:
self.gdf = self._solve_relations(annotations, definitions, join_key) # type: ignore[arg-type]
else:
self.gdf = annotations

@staticmethod
def _solve_relations(
    annotations: GeoDataFrame, definitions: GeoDataFrame, join_key: str
) -> GeoDataFrame:
    """Merge definition properties into annotations using the join key.

    Columns that carry values only on the definition side are folded into the
    result under their own name. Columns that carry values on both sides keep
    the annotation value under the plain name and expose the definition value
    with a ``_def`` suffix.

    Args:
        annotations: Rows to enrich; must contain ``join_key``.
        definitions: Rows providing the shared properties; must contain
            ``join_key`` and a ``geometry`` column (which is discarded).
        join_key: Name of the column to merge on.

    Returns:
        ``annotations`` left-joined with the definition properties. Rows
        without a matching definition get nulls in the definition columns.
    """
    # Drop all-null columns from annotations so they don't shadow definition values
    ann_null_cols = [
        c
        for c in annotations.columns
        if c != "geometry" and c != join_key and annotations[c].isna().all()
    ]
    annotations_clean = annotations.drop(columns=ann_null_cols)

    lookup = definitions.drop(columns=["geometry"])

    # Symmetric clean-up: a column that is all-null in the definitions but still
    # present in the annotations would only collide and yield an all-null
    # ``<name>_def`` column, so it is removed before merging.
    def_shadow_cols = [
        c
        for c in lookup.columns
        if c != join_key
        and c in annotations_clean.columns
        and lookup[c].isna().all()
    ]
    lookup = lookup.drop(columns=def_shadow_cols)

    # pandas matches null keys against each other; without this filter every
    # annotation lacking a key would inherit a key-less definition's properties.
    lookup = lookup[lookup[join_key].notna()]

    return annotations_clean.merge(
        lookup,
        on=join_key,
        how="left",
        suffixes=("", "_def"),
    )

def get_filtered_geodataframe(
self, separator: str = "_", **kwargs: str
Expand All @@ -43,17 +92,13 @@ def get_filtered_geodataframe(
# If the first part of the key doesn't exist, return an empty frame
return self.gdf.iloc[0:0]

series = filtered_gdf[subkeys[0]].astype(str)
if len(subkeys) > 1:
mask = series.apply(is_json_dict)
series = series[mask].apply(lambda x: json.loads(x))
filtered_gdf = filtered_gdf[mask]

series = filtered_gdf[subkeys[0]]
for subkey in subkeys[1:]:
series = series.apply(safe_to_dict)
mask = series.apply(
lambda x, subkey=subkey: isinstance(x, dict) and subkey in x
lambda x, sk=subkey: isinstance(x, dict) and sk in x
)
series = series[mask].apply(lambda x, subkey=subkey: x[subkey])
series = series[mask].apply(lambda x, sk=subkey: x[sk])
filtered_gdf = filtered_gdf[mask]

series = series.astype(str)
Expand Down Expand Up @@ -91,11 +136,11 @@ def get_points(self, **kwargs: str) -> Iterable[Point]:
yield geom


# Return True only when `string` parses as JSON and the top-level value is a
# JSON object (a Python dict); any other JSON value, or a parse failure,
# yields False.
def is_json_dict(string: str) -> bool:
try:
valid_json = json.loads(string)
if isinstance(valid_json, dict):
return True
# json.JSONDecodeError is a ValueError subclass, so malformed JSON lands here.
except ValueError:
return False
# Parsed fine, but the top-level value is not a dict.
return False
def safe_to_dict(x: str | Any) -> Any:
"""Safely converts potential JSON strings to dict, preserving existing dicts and NaNs."""
if isinstance(x, str):
try:
return json.loads(x)
except (json.JSONDecodeError, TypeError):
return x
return x
75 changes: 75 additions & 0 deletions tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import io
import json

import pandas as pd
import pytest

from ratiopath.parsers import ASAPParser, Darwin7JSONParser, GeoJSONParser
Expand Down Expand Up @@ -141,6 +142,80 @@ def test_get_polygons_with_filters(self, geojson_content):
polygons = list(parser.get_polygons(name="nonexistent"))
assert len(polygons) == 0

@pytest.fixture
def geojson_with_relations_content(self):
"""Sample GeoJSON content with relations (definitions and annotations).

Contains two polygon annotations (presetID "a1" and "b2") and one
geometry-less definition for "a1"; "b2" has no definition.
"""
return {
"type": "FeatureCollection",
"features": [
# Annotation "a1": shares presetID and the shared_attr key with the definition.
{
"type": "Feature",
"geometry": {
"type": "Polygon",
"coordinates": [
[[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0], [0.0, 0.0]]
],
},
"properties": {"presetID": "a1", "shared_attr": "A"},
},
# Definition for "a1": carries nested `meta` and a conflicting shared_attr.
{
"type": "Feature",
"geometry": None, # Definition without geometry
"properties": {
"presetID": "a1",
"meta": {
"category": {"name": "Category", "value": "Healthy Tissue"}
},
"shared_attr": "B",
},
},
# Annotation "b2": no feature in this fixture defines presetID "b2".
{
"type": "Feature",
"geometry": {
"type": "Polygon",
"coordinates": [
[[2.0, 2.0], [3.0, 2.0], [3.0, 3.0], [2.0, 3.0], [2.0, 2.0]]
],
},
"properties": {"presetID": "b2"},
},
],
}

def test_solve_relations_successful_merge(self, geojson_with_relations_content):
"""Test resolving relations between annotations and definitions."""
f = io.StringIO(json.dumps(geojson_with_relations_content))

parser = GeoJSONParser(f, join_key="presetID")

# Only the two polygon features remain as rows, all with a geometry.
assert len(parser.gdf) == 2
assert parser.gdf.geometry.notna().all()

target_row_a1 = parser.gdf[parser.gdf["presetID"] == "a1"].iloc[0]

# `meta` may come back as a JSON string or as a dict; normalise before asserting.
raw_meta = target_row_a1["meta"]
meta_dict = json.loads(raw_meta) if isinstance(raw_meta, str) else raw_meta

assert meta_dict["category"]["value"] == "Healthy Tissue"

# Colliding key: the annotation value keeps the plain name, the
# definition value is exposed under the `_def` suffix.
assert target_row_a1["shared_attr"] == "A"
assert target_row_a1["shared_attr_def"] == "B"

# "b2" has no definition in the fixture, so the merged-in fields are null.
target_row_b2 = parser.gdf[parser.gdf["presetID"] == "b2"].iloc[0]
assert pd.isna(target_row_b2.get("meta"))
assert pd.isna(target_row_b2.get("shared_attr_def"))

def test_solve_relations_missing_join_key(self, geojson_with_relations_content):
"""Test solve_relations behavior when the join key is missing."""
f = io.StringIO(json.dumps(geojson_with_relations_content))

# "invalid_key" is not a property of any feature in the fixture.
parser = GeoJSONParser(f, join_key="invalid_key")

# The two polygon annotations are still returned, each with a geometry.
assert len(parser.gdf) == 2
assert parser.gdf.geometry.notna().all()

# No `_def`-suffixed columns may appear when the join key is absent.
assert not any(col.endswith("_def") for col in parser.gdf.columns)


class TestDarwin7JSONParser:
"""Test the Darwin JSON parser."""
Expand Down
8 changes: 4 additions & 4 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.