From 3d67ec530e9d671fdfbebca5688cb40b7f7dad50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Sat, 21 Feb 2026 22:17:59 +0100 Subject: [PATCH 01/28] feature: implement parsing of files with polygons and definitions seperated --- ratiopath/parsers/geojson_parser.py | 73 +++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 15 deletions(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 5252956..6d78194 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -1,8 +1,9 @@ import json from collections.abc import Iterable from pathlib import Path -from typing import TextIO +from typing import Any, TextIO +import pandas as pd import geopandas as gpd from geopandas import GeoDataFrame from shapely import Point, Polygon @@ -19,8 +20,17 @@ def __init__(self, file_path: Path | str | TextIO) -> None: self.gdf = gpd.read_file(file_path) if not self.gdf.empty: - # Explode Multi-part geometries to simplify geometry handling - self.gdf = self.gdf.explode(index_parts=True) + # Isolate definitions (no geometry) from physical annotations + mask = self.gdf.geometry.isna() | self.gdf.geometry.is_empty + definitions = self.gdf[mask] + annotations = self.gdf[~mask] + + if not annotations.empty: + annotations = annotations.explode(index_parts=True) + + self.gdf = gpd.GeoDataFrame(pd.concat([annotations, definitions], ignore_index=True), geometry="geometry") + + def get_filtered_geodataframe( self, separator: str = "_", **kwargs: str @@ -40,15 +50,19 @@ def get_filtered_geodataframe( for key, pattern in kwargs.items(): subkeys = key.split(separator) if not subkeys or subkeys[0] not in filtered_gdf.columns: - # If the first part of the key doesn't exist, return an empty frame - return self.gdf.iloc[0:0] + # If the first part of the key doesn't exist, return an empty frame with "geometry" column + return gpd.GeoDataFrame(self.gdf.iloc[0:0], geometry="geometry") - series = filtered_gdf[subkeys[0]].astype(str) + series = filtered_gdf[subkeys[0]] if len(subkeys) > 1: mask = series.apply(is_json_dict) - series = series[mask].apply(lambda x: json.loads(x)) + series = series[mask].apply(lambda x: json.loads(x) if isinstance(x, str) else x) filtered_gdf = filtered_gdf[mask] + # Protection against Pandas dropping all columns when applying masks to 0-row DataFrames + if filtered_gdf.empty: + return gpd.GeoDataFrame(filtered_gdf, geometry="geometry") + for subkey in subkeys[1:]: mask = series.apply( lambda x, subkey=subkey: isinstance(x, dict) and subkey in x @@ -56,11 +70,14 @@ def get_filtered_geodataframe( series = series[mask].apply(lambda x, subkey=subkey: x[subkey]) filtered_gdf = filtered_gdf[mask] + if filtered_gdf.empty: + return gpd.GeoDataFrame(filtered_gdf, geometry="geometry") + series = series.astype(str) mask = series.str.match(pattern, na=False) filtered_gdf = filtered_gdf[mask] - return filtered_gdf + return gpd.GeoDataFrame(filtered_gdf, geometry="geometry") def get_polygons(self, **kwargs: str) -> Iterable[Polygon]: """Get polygons from the GeoDataFrame. @@ -90,12 +107,38 @@ def get_points(self, **kwargs: str) -> Iterable[Point]: if isinstance(geom, Point): yield geom + def solve_relations(self, join_key: str) -> None: + """Merge properties from non-geometry features into geometry features based on a join key. -def is_json_dict(string: str) -> bool: - try: - valid_json = json.loads(string) - if isinstance(valid_json, dict): - return True - except ValueError: - return False + Args: + join_key: The column name used to link non-geometry definitions to geometry features. + """ + if join_key not in self.gdf.columns: + return + + is_empty_geom = self.gdf.geometry.isna() | self.gdf.geometry.is_empty + definitions = self.gdf[is_empty_geom].drop(columns=["geometry"], errors="ignore").dropna(axis=1, how="all") + annotations = self.gdf[~is_empty_geom] + + if definitions.empty or annotations.empty: + return + + # Suffixes prevent naming conflicts; empty attributes in annotations become '_orig' + merged_df = annotations.merge( + definitions, + on=join_key, + how="left", + suffixes=("_orig", "") + ) + + self.gdf = gpd.GeoDataFrame(merged_df, geometry="geometry") + +def is_json_dict(obj: Any) -> bool: + if isinstance(obj, dict): + return True + if isinstance(obj, str): + try: + return isinstance(json.loads(obj), dict) + except json.JSONDecodeError: + return False return False From b29ba8121f22ff5f86b3d887658bf7ffae2f0e02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Sun, 22 Feb 2026 14:51:16 +0100 Subject: [PATCH 02/28] docs: add relational schema diagram to GeoJSONParser docstring --- ratiopath/parsers/geojson_parser.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 6d78194..3adeeb5 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -14,6 +14,22 @@ class GeoJSONParser: GeoJSON is a format for encoding geographic data structures using JSON. This parser supports both polygon and point geometries. + + Extended capabilities: + - Relational metadata integration: Maps properties from geometry-less definition + features to spatial annotation features via a shared join key (solve_relations). + + Expected relational schema for solve_relations: + FeatureCollection + ├── Feature (Definition) + │ ├── geometry: null + │ └── properties + │ ├── presetID: "a376..." <──────┐ (join_key) + │ └── meta: { "Category": "..." } │ + └── Feature (Annotation) │ + ├── geometry: { "type": "Polygon" } │ + └── properties │ + └── presetID: "a376..." <──────┘ """ def __init__(self, file_path: Path | str | TextIO) -> None: From e4cb76153d260698f8a0a1eb7fa24ac24e5f7ec1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Mon, 23 Feb 2026 22:57:04 +0100 Subject: [PATCH 03/28] chore: improve readability --- ratiopath/parsers/geojson_parser.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 3adeeb5..4578a11 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -37,12 +37,12 @@ def __init__(self, file_path: Path | str | TextIO) -> None: if not self.gdf.empty: # Isolate definitions (no geometry) from physical annotations - mask = self.gdf.geometry.isna() | self.gdf.geometry.is_empty - definitions = self.gdf[mask] - annotations = self.gdf[~mask] + has_null_geometry = self.gdf.geometry.isna() | self.gdf.geometry.is_empty + definitions = self.gdf[has_null_geometry] + annotations = self.gdf[~has_null_geometry] if not annotations.empty: - annotations = annotations.explode(index_parts=True) + annotations = annotations.explode(index_parts=True) # Decompose MultiPolygons into individual Shapely geometries self.gdf = gpd.GeoDataFrame(pd.concat([annotations, definitions], ignore_index=True), geometry="geometry") @@ -77,7 +77,7 @@ def get_filtered_geodataframe( # Protection against Pandas dropping all columns when applying masks to 0-row DataFrames if filtered_gdf.empty: - return gpd.GeoDataFrame(filtered_gdf, geometry="geometry") + return filtered_gdf for subkey in subkeys[1:]: mask = series.apply( @@ -87,13 +87,13 @@ def get_filtered_geodataframe( filtered_gdf = filtered_gdf[mask] if filtered_gdf.empty: - return gpd.GeoDataFrame(filtered_gdf, geometry="geometry") + return filtered_gdf series = series.astype(str) mask = series.str.match(pattern, na=False) filtered_gdf = filtered_gdf[mask] - return gpd.GeoDataFrame(filtered_gdf, geometry="geometry") + return filtered_gdf def get_polygons(self, **kwargs: str) -> Iterable[Polygon]: """Get polygons from the GeoDataFrame. From 7170bd70ae0c17f77888cf15bd7f2dfdee10bf71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Mon, 23 Feb 2026 22:58:21 +0100 Subject: [PATCH 04/28] fix: format --- ratiopath/parsers/geojson_parser.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 4578a11..8b17cd2 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -3,8 +3,8 @@ from pathlib import Path from typing import Any, TextIO -import pandas as pd import geopandas as gpd +import pandas as pd from geopandas import GeoDataFrame from shapely import Point, Polygon @@ -42,11 +42,14 @@ def __init__(self, file_path: Path | str | TextIO) -> None: annotations = self.gdf[~has_null_geometry] if not annotations.empty: - annotations = annotations.explode(index_parts=True) # Decompose MultiPolygons into individual Shapely geometries - - self.gdf = gpd.GeoDataFrame(pd.concat([annotations, definitions], ignore_index=True), geometry="geometry") - + annotations = annotations.explode( + index_parts=True + ) # Decompose MultiPolygons into individual Shapely geometries + self.gdf = gpd.GeoDataFrame( + pd.concat([annotations, definitions], ignore_index=True), + geometry="geometry", + ) def get_filtered_geodataframe( self, separator: str = "_", **kwargs: str @@ -72,7 +75,9 @@ def get_filtered_geodataframe( series = filtered_gdf[subkeys[0]] if len(subkeys) > 1: mask = series.apply(is_json_dict) - series = series[mask].apply(lambda x: json.loads(x) if isinstance(x, str) else x) + series = series[mask].apply( + lambda x: json.loads(x) if isinstance(x, str) else x + ) filtered_gdf = filtered_gdf[mask] # Protection against Pandas dropping all columns when applying masks to 0-row DataFrames @@ -133,7 +138,11 @@ def solve_relations(self, join_key: str) -> None: return is_empty_geom = self.gdf.geometry.isna() | self.gdf.geometry.is_empty - definitions = self.gdf[is_empty_geom].drop(columns=["geometry"], errors="ignore").dropna(axis=1, how="all") + definitions = ( + self.gdf[is_empty_geom] + .drop(columns=["geometry"], errors="ignore") + .dropna(axis=1, how="all") + ) annotations = self.gdf[~is_empty_geom] if definitions.empty or annotations.empty: @@ -141,14 +150,12 @@ def solve_relations(self, join_key: str) -> None: # Suffixes prevent naming conflicts; empty attributes in annotations become '_orig' merged_df = annotations.merge( - definitions, - on=join_key, - how="left", - suffixes=("_orig", "") + definitions, on=join_key, how="left", suffixes=("_orig", "") ) self.gdf = gpd.GeoDataFrame(merged_df, geometry="geometry") + def is_json_dict(obj: Any) -> bool: if isinstance(obj, dict): return True From 3269c635a9d84ae07e82883c796b3b81c8857974 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Mon, 23 Feb 2026 23:42:28 +0100 Subject: [PATCH 05/28] fix: redundat dataframe creation, added documentation --- ratiopath/parsers/geojson_parser.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 8b17cd2..3d6f0a0 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -70,7 +70,7 @@ def get_filtered_geodataframe( subkeys = key.split(separator) if not subkeys or subkeys[0] not in filtered_gdf.columns: # If the first part of the key doesn't exist, return an empty frame with "geometry" column - return gpd.GeoDataFrame(self.gdf.iloc[0:0], geometry="geometry") + return self.gdf.iloc[0:0] series = filtered_gdf[subkeys[0]] if len(subkeys) > 1: @@ -131,6 +131,10 @@ def get_points(self, **kwargs: str) -> Iterable[Point]: def solve_relations(self, join_key: str) -> None: """Merge properties from non-geometry features into geometry features based on a join key. + Side effects: + - Non-geometry features (definitions) are permanently removed from self.gdf. + - Annotations without a matching definition receive NaN values for the imported attributes. + Args: join_key: The column name used to link non-geometry definitions to geometry features. """ @@ -138,19 +142,22 @@ def solve_relations(self, join_key: str) -> None: return is_empty_geom = self.gdf.geometry.isna() | self.gdf.geometry.is_empty - definitions = ( - self.gdf[is_empty_geom] - .drop(columns=["geometry"], errors="ignore") - .dropna(axis=1, how="all") - ) + definitions = self.gdf[is_empty_geom].drop(columns=["geometry"]).dropna(axis=1, how="all") annotations = self.gdf[~is_empty_geom] if definitions.empty or annotations.empty: return - # Suffixes prevent naming conflicts; empty attributes in annotations become '_orig' + if definitions[join_key].isna().all(): + return + + definitions = definitions.dropna(axis=1, how="all") + merged_df = annotations.merge( - definitions, on=join_key, how="left", suffixes=("_orig", "") + definitions, + on=join_key, + how="left", + suffixes=("_orig", "") ) self.gdf = gpd.GeoDataFrame(merged_df, geometry="geometry") From 1eb4c6b75563d6b19d40c48e685f602e7a9a6fe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Mon, 23 Feb 2026 23:43:36 +0100 Subject: [PATCH 06/28] fix: format --- ratiopath/parsers/geojson_parser.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 3d6f0a0..4b55a10 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -142,7 +142,9 @@ def solve_relations(self, join_key: str) -> None: return is_empty_geom = self.gdf.geometry.isna() | self.gdf.geometry.is_empty - definitions = self.gdf[is_empty_geom].drop(columns=["geometry"]).dropna(axis=1, how="all") + definitions = ( + self.gdf[is_empty_geom].drop(columns=["geometry"]).dropna(axis=1, how="all") + ) annotations = self.gdf[~is_empty_geom] if definitions.empty or annotations.empty: @@ -154,10 +156,7 @@ def solve_relations(self, join_key: str) -> None: definitions = definitions.dropna(axis=1, how="all") merged_df = annotations.merge( - definitions, - on=join_key, - how="left", - suffixes=("_orig", "") + definitions, on=join_key, how="left", suffixes=("_orig", "") ) self.gdf = gpd.GeoDataFrame(merged_df, geometry="geometry") From 8393613f5617a93233ac8bada1d3aef4b62dd8ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Tue, 24 Feb 2026 23:14:05 +0100 Subject: [PATCH 07/28] feat: add relational geojson test --- tests/test_parsers.py | 68 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 7c0662f..90b6a37 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -140,3 +140,71 @@ def test_get_polygons_with_filters(self, geojson_content): polygons = list(parser.get_polygons(name="nonexistent")) assert len(polygons) == 0 + + @pytest.fixture + def geojson_with_relations_content(self): + """Sample GeoJSON content with relations (definitions and annotations).""" + return { + "type": "FeatureCollection", + "features": [ + { + "type": "Feature", + "geometry": { + "type": "Polygon", + "coordinates": [ + [[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0], [0.0, 0.0]] + ], + }, + "properties": {"presetID": "a1", "shared_attr": "A"}, + }, + { + "type": "Feature", + "geometry": None, # Definition without geometry + "properties": { + "presetID": "a1", + "category": "Tumor", + "shared_attr": "B", + }, + }, + { + "type": "Feature", + "geometry": { + "type": "Polygon", + "coordinates": [ + [[2.0, 2.0], [3.0, 2.0], [3.0, 3.0], [2.0, 3.0], [2.0, 2.0]] + ], + }, + "properties": {"presetID": "b2"}, + }, + ], + } + + def test_solve_relations_successful_merge(self, geojson_with_relations_content): + """Test resolving relations between annotations and definitions.""" + f = io.StringIO(json.dumps(geojson_with_relations_content)) + parser = GeoJSONParser(f) + + parser.solve_relations(join_key="presetID") + + # Definitions must be removed, only two annotations should remain + assert len(parser.gdf) == 2 + assert parser.gdf.geometry.notna().all() + + # Validation of merged data under key "a1" + tumor_row = parser.gdf[parser.gdf["presetID"] == "a1"].iloc[0] + assert tumor_row["category"] == "Tumor" + + # Validation of collisions within columns + assert tumor_row["shared_attr_orig"] == "A" + assert tumor_row["shared_attr_def"] == "B" + + def test_solve_relations_missing_join_key(self, geojson_with_relations_content): + """Test solve_relations behavior when the join key is missing.""" + f = io.StringIO(json.dumps(geojson_with_relations_content)) + parser = GeoJSONParser(f) + + parser.solve_relations(join_key="invalid_key") + + assert len(parser.gdf) == 2 + assert parser.gdf.geometry.notna().all() + assert "category_def" not in parser.gdf.columns \ No newline at end of file From fb160f19ac926e376eb1b021f6ae6c3e099f1338 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Tue, 24 Feb 2026 23:14:49 +0100 Subject: [PATCH 08/28] fix: format --- tests/test_parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 90b6a37..96d1fe0 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -207,4 +207,4 @@ def test_solve_relations_missing_join_key(self, geojson_with_relations_content): assert len(parser.gdf) == 2 assert parser.gdf.geometry.notna().all() - assert "category_def" not in parser.gdf.columns \ No newline at end of file + assert "category_def" not in parser.gdf.columns From d091b79f15d8fed1a428c3005599312c7bb2b1f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Tue, 24 Feb 2026 23:16:03 +0100 Subject: [PATCH 09/28] fix: change the filtering logic, improve safety of solving relations --- ratiopath/parsers/geojson_parser.py | 66 +++++++++++------------------ 1 file changed, 25 insertions(+), 41 deletions(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 4b55a10..4555695 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -1,7 +1,6 @@ -import json from collections.abc import Iterable from pathlib import Path -from typing import Any, TextIO +from typing import TextIO import geopandas as gpd import pandas as pd @@ -69,31 +68,17 @@ def get_filtered_geodataframe( for key, pattern in kwargs.items(): subkeys = key.split(separator) if not subkeys or subkeys[0] not in filtered_gdf.columns: - # If the first part of the key doesn't exist, return an empty frame with "geometry" column + # If the first part of the key doesn't exist, return an empty frame return self.gdf.iloc[0:0] series = filtered_gdf[subkeys[0]] - if len(subkeys) > 1: - mask = series.apply(is_json_dict) - series = series[mask].apply( - lambda x: json.loads(x) if isinstance(x, str) else x - ) - filtered_gdf = filtered_gdf[mask] - - # Protection against Pandas dropping all columns when applying masks to 0-row DataFrames - if filtered_gdf.empty: - return filtered_gdf - for subkey in subkeys[1:]: mask = series.apply( - lambda x, subkey=subkey: isinstance(x, dict) and subkey in x + lambda x, sk=subkey: isinstance(x, dict) and sk in x ) - series = series[mask].apply(lambda x, subkey=subkey: x[subkey]) + series = series[mask].apply(lambda x, sk=subkey: x[sk]) filtered_gdf = filtered_gdf[mask] - if filtered_gdf.empty: - return filtered_gdf - series = series.astype(str) mask = series.str.match(pattern, na=False) filtered_gdf = filtered_gdf[mask] @@ -138,36 +123,35 @@ def solve_relations(self, join_key: str) -> None: Args: join_key: The column name used to link non-geometry definitions to geometry features. """ + is_empty_geom = self.gdf.geometry.isna() | self.gdf.geometry.is_empty + annotations = self.gdf[~is_empty_geom].copy() + if join_key not in self.gdf.columns: + self.gdf = annotations return - is_empty_geom = self.gdf.geometry.isna() | self.gdf.geometry.is_empty - definitions = ( - self.gdf[is_empty_geom].drop(columns=["geometry"]).dropna(axis=1, how="all") - ) - annotations = self.gdf[~is_empty_geom] + definitions = self.gdf[is_empty_geom].copy() if definitions.empty or annotations.empty: + self.gdf = annotations return - if definitions[join_key].isna().all(): + if definitions[join_key].isna().all() or annotations[join_key].isna().all(): + self.gdf = annotations return - definitions = definitions.dropna(axis=1, how="all") + if definitions[join_key].duplicated().any(): + raise ValueError(f"Duplicate definition for key '{join_key}' found.") - merged_df = annotations.merge( - definitions, on=join_key, how="left", suffixes=("_orig", "") + definitions = definitions.drop(columns=["geometry"], errors="ignore").dropna( + axis=1, how="all" + ) + annotations = annotations.dropna(axis=1, how="all") + + self.gdf = gpd.GeoDataFrame( + annotations.merge( + definitions, on=join_key, how="left", suffixes=("_orig", "_def") + ), + geometry="geometry", + crs=self.gdf.crs, ) - - self.gdf = gpd.GeoDataFrame(merged_df, geometry="geometry") - - -def is_json_dict(obj: Any) -> bool: - if isinstance(obj, dict): - return True - if isinstance(obj, str): - try: - return isinstance(json.loads(obj), dict) - except json.JSONDecodeError: - return False - return False From d70a1a82b8a6595daddd53f4bcc2f449e67737fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Tue, 24 Feb 2026 23:50:38 +0100 Subject: [PATCH 10/28] fix: use dict check --- ratiopath/parsers/geojson_parser.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 4555695..561448e 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -1,6 +1,7 @@ +import json from collections.abc import Iterable from pathlib import Path -from typing import TextIO +from typing import Any, TextIO import geopandas as gpd import pandas as pd @@ -73,6 +74,7 @@ def get_filtered_geodataframe( series = filtered_gdf[subkeys[0]] for subkey in subkeys[1:]: + series = series.apply(safe_to_dict) mask = series.apply( lambda x, sk=subkey: isinstance(x, dict) and sk in x ) @@ -155,3 +157,13 @@ def solve_relations(self, join_key: str) -> None: geometry="geometry", crs=self.gdf.crs, ) + + +def safe_to_dict(x: str | Any) -> Any: + """Safely converts potential JSON strings to dict, preserving existing dicts and NaNs.""" + if isinstance(x, str): + try: + return json.loads(x) + except (json.JSONDecodeError, TypeError): + return x + return x From 20cf512a0bdbf0b442a3e380fbe5a7b82a1e33d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Tue, 24 Feb 2026 23:55:05 +0100 Subject: [PATCH 11/28] fix: give crs for further processing --- ratiopath/parsers/geojson_parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 561448e..76761e7 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -34,6 +34,7 @@ class GeoJSONParser: def __init__(self, file_path: Path | str | TextIO) -> None: self.gdf = gpd.read_file(file_path) + original_crs = self.gdf.crs if not self.gdf.empty: # Isolate definitions (no geometry) from physical annotations @@ -49,6 +50,7 @@ def __init__(self, file_path: Path | str | TextIO) -> None: self.gdf = gpd.GeoDataFrame( pd.concat([annotations, definitions], ignore_index=True), geometry="geometry", + crs=original_crs, ) def get_filtered_geodataframe( From 0e15845596a9754f51b6a05c635eed34ad50af0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Thu, 26 Feb 2026 14:56:34 +0100 Subject: [PATCH 12/28] fix: change the example path to tissue type --- ratiopath/parsers/geojson_parser.py | 2 +- tests/test_parsers.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 76761e7..28a7d35 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -25,7 +25,7 @@ class GeoJSONParser: │ ├── geometry: null │ └── properties │ ├── presetID: "a376..." <──────┐ (join_key) - │ └── meta: { "Category": "..." } │ + │ └── meta: { "category": { "name": "Category", "value": "Healthy Tissue" } } └── Feature (Annotation) │ ├── geometry: { "type": "Polygon" } │ └── properties │ diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 96d1fe0..06461a5 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -162,7 +162,7 @@ def geojson_with_relations_content(self): "geometry": None, # Definition without geometry "properties": { "presetID": "a1", - "category": "Tumor", + "meta": {"category": {"name": "Category", "value": "Healthy Tissue"}}, "shared_attr": "B", }, }, @@ -191,12 +191,12 @@ def test_solve_relations_successful_merge(self, geojson_with_relations_content): assert parser.gdf.geometry.notna().all() # Validation of merged data under key "a1" - tumor_row = parser.gdf[parser.gdf["presetID"] == "a1"].iloc[0] - assert tumor_row["category"] == "Tumor" + target_row = parser.gdf[parser.gdf["presetID"] == "a1"].iloc[0] + assert target_row["meta"]["category"]["value"] == "Healthy Tissue" # Validation of collisions within columns - assert tumor_row["shared_attr_orig"] == "A" - assert tumor_row["shared_attr_def"] == "B" + assert target_row["shared_attr_orig"] == "A" + assert target_row["shared_attr_def"] == "B" def test_solve_relations_missing_join_key(self, geojson_with_relations_content): """Test solve_relations behavior when the join key is missing.""" From b5bd8705755570157d82f71503c14c5f9749b9b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Thu, 26 Feb 2026 15:00:08 +0100 Subject: [PATCH 13/28] feat: add validation for annotation without definiton --- tests/test_parsers.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 06461a5..0269f79 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -2,6 +2,7 @@ import io import json +import pandas as pd import pytest @@ -190,13 +191,15 @@ def test_solve_relations_successful_merge(self, geojson_with_relations_content): assert len(parser.gdf) == 2 assert parser.gdf.geometry.notna().all() - # Validation of merged data under key "a1" - target_row = parser.gdf[parser.gdf["presetID"] == "a1"].iloc[0] - assert target_row["meta"]["category"]["value"] == "Healthy Tissue" + target_row_a1 = parser.gdf[parser.gdf["presetID"] == "a1"].iloc[0] + assert target_row_a1["meta"]["category"]["value"] == "Healthy Tissue" + assert target_row_a1["shared_attr_orig"] == "A" + assert target_row_a1["shared_attr_def"] == "B" - # Validation of collisions within columns - assert target_row["shared_attr_orig"] == "A" - assert target_row["shared_attr_def"] == "B" + target_row_b2 = parser.gdf[parser.gdf["presetID"] == "b2"].iloc[0] + assert pd.isna(target_row_b2.get("meta")) + assert pd.isna(target_row_b2.get("shared_attr_orig")) + assert pd.isna(target_row_b2.get("shared_attr_def")) def test_solve_relations_missing_join_key(self, geojson_with_relations_content): """Test solve_relations behavior when the join key is missing.""" From def9bcade66867a3ba18cb04c7e8b14b258ace79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Thu, 26 Feb 2026 15:01:25 +0100 Subject: [PATCH 14/28] chore: remove redundant variable --- ratiopath/parsers/geojson_parser.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 28a7d35..295ebae 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -34,7 +34,6 @@ class GeoJSONParser: def __init__(self, file_path: Path | str | TextIO) -> None: self.gdf = gpd.read_file(file_path) - original_crs = self.gdf.crs if not self.gdf.empty: # Isolate definitions (no geometry) from physical annotations @@ -50,7 +49,7 @@ def __init__(self, file_path: Path | str | TextIO) -> None: self.gdf = gpd.GeoDataFrame( pd.concat([annotations, definitions], ignore_index=True), geometry="geometry", - crs=original_crs, + crs=self.gdf.crs, ) def get_filtered_geodataframe( From 6c8bd2981a99935792dffdb563c5ef3acdd7800b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Thu, 26 Feb 2026 15:01:52 +0100 Subject: [PATCH 15/28] fix: format --- tests/test_parsers.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 0269f79..09498ba 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -2,8 +2,8 @@ import io import json -import pandas as pd +import pandas as pd import pytest from ratiopath.parsers import ASAPParser, GeoJSONParser @@ -163,7 +163,9 @@ def geojson_with_relations_content(self): "geometry": None, # Definition without geometry "properties": { "presetID": "a1", - "meta": {"category": {"name": "Category", "value": "Healthy Tissue"}}, + "meta": { + "category": {"name": "Category", "value": "Healthy Tissue"} + }, "shared_attr": "B", }, }, From 87395743f334f2c76bac9fcd14f70e96cca7f7d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Thu, 26 Feb 2026 15:11:20 +0100 Subject: [PATCH 16/28] fix: handle dynamic json parsing in geojson solve_relations --- tests/test_parsers.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 09498ba..7721f4b 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -189,12 +189,15 @@ def test_solve_relations_successful_merge(self, geojson_with_relations_content): parser.solve_relations(join_key="presetID") - # Definitions must be removed, only two annotations should remain assert len(parser.gdf) == 2 assert parser.gdf.geometry.notna().all() target_row_a1 = parser.gdf[parser.gdf["presetID"] == "a1"].iloc[0] - assert target_row_a1["meta"]["category"]["value"] == "Healthy Tissue" + + raw_meta = target_row_a1["meta"] + meta_dict = json.loads(raw_meta) if isinstance(raw_meta, str) else raw_meta + + assert meta_dict["category"]["value"] == "Healthy Tissue" assert target_row_a1["shared_attr_orig"] == "A" assert target_row_a1["shared_attr_def"] == "B" From c25ed4d101ee4a7e6e14449622833e73f58c5f03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Thu, 26 Feb 2026 15:19:46 +0100 Subject: [PATCH 17/28] refactor: dynamically assert the absence of any columns ending with _orig or _def --- tests/test_parsers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 7721f4b..e531bbf 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -215,4 +215,6 @@ def test_solve_relations_missing_join_key(self, geojson_with_relations_content): assert len(parser.gdf) == 2 assert parser.gdf.geometry.notna().all() - assert "category_def" not in parser.gdf.columns + + assert not any(col.endswith("_orig") for col in parser.gdf.columns) + assert not any(col.endswith("_def") for col in parser.gdf.columns) From 81aa4ac12ab3d143c83a2c4db44a0f8c257301dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Mon, 2 Mar 2026 15:20:55 +0100 Subject: [PATCH 18/28] fix: update geopandas version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5400891..dac47a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "shapely>=2.0.0", "torch>=2.6.0", "zarr>=3.1.1", - "geopandas>=1.1.1", + "geopandas>=1.1.2", "rasterio>=1.4.3", ] From 86a83e27d9cec6d0abcc9f813bafd74950d06385 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Mon, 2 Mar 2026 15:53:06 +0100 Subject: [PATCH 19/28] fix: rewrite tests for the new private function --- tests/test_parsers.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index e531bbf..917bad2 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -185,9 +185,8 @@ def geojson_with_relations_content(self): def test_solve_relations_successful_merge(self, geojson_with_relations_content): """Test resolving relations between annotations and definitions.""" f = io.StringIO(json.dumps(geojson_with_relations_content)) - parser = GeoJSONParser(f) - parser.solve_relations(join_key="presetID") + parser = GeoJSONParser(f, join_key="presetID") assert len(parser.gdf) == 2 assert parser.gdf.geometry.notna().all() @@ -198,23 +197,21 @@ def test_solve_relations_successful_merge(self, geojson_with_relations_content): meta_dict = json.loads(raw_meta) if isinstance(raw_meta, str) else raw_meta assert meta_dict["category"]["value"] == "Healthy Tissue" - assert target_row_a1["shared_attr_orig"] == "A" + + assert target_row_a1["shared_attr"] == "A" assert target_row_a1["shared_attr_def"] == "B" target_row_b2 = parser.gdf[parser.gdf["presetID"] == "b2"].iloc[0] assert pd.isna(target_row_b2.get("meta")) - assert pd.isna(target_row_b2.get("shared_attr_orig")) assert pd.isna(target_row_b2.get("shared_attr_def")) def test_solve_relations_missing_join_key(self, geojson_with_relations_content): """Test solve_relations behavior when the join key is missing.""" f = io.StringIO(json.dumps(geojson_with_relations_content)) - parser = GeoJSONParser(f) - parser.solve_relations(join_key="invalid_key") + parser = GeoJSONParser(f, join_key="invalid_key") assert len(parser.gdf) == 2 assert parser.gdf.geometry.notna().all() - assert not any(col.endswith("_orig") for col in parser.gdf.columns) assert not any(col.endswith("_def") for col in parser.gdf.columns) From 9bb0a249d1c59bd9a9ff26d8c54789c7cae43cb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Mon, 2 Mar 2026 15:57:54 +0100 Subject: [PATCH 20/28] refactor: change the solve relations logic for merging definitions and annotations --- ratiopath/parsers/geojson_parser.py | 102 +++++++++++----------------- 1 file changed, 40 insertions(+), 62 deletions(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 295ebae..bcf73c1 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -4,7 +4,6 @@ from typing import Any, TextIO import geopandas as gpd -import pandas as pd from geopandas import GeoDataFrame from shapely import Point, Polygon @@ -32,25 +31,47 @@ class GeoJSONParser: └── presetID: "a376..." <──────┘ """ - def __init__(self, file_path: Path | str | TextIO) -> None: - self.gdf = gpd.read_file(file_path) - - if not self.gdf.empty: - # Isolate definitions (no geometry) from physical annotations - has_null_geometry = self.gdf.geometry.isna() | self.gdf.geometry.is_empty - definitions = self.gdf[has_null_geometry] - annotations = self.gdf[~has_null_geometry] - - if not annotations.empty: - annotations = annotations.explode( - index_parts=True - ) # Decompose MultiPolygons into individual Shapely geometries + def __init__( + self, file_path: Path | str | TextIO, join_key: str | None = "presetID" + ) -> None: + gdf = gpd.read_file(file_path) + + if not gdf.empty: + has_geometry = ~(gdf.geometry.is_empty | gdf.geometry.isna()) + annotations = gdf[has_geometry].explode(index_parts=True) + definitions = gdf[~has_geometry] + + if join_key and join_key in gdf.columns and not definitions.empty: + self.gdf = self._solve_relations(annotations, definitions, join_key) + else: + self.gdf = annotations + else: + self.gdf = gdf + + @staticmethod + def _solve_relations( + annotations: GeoDataFrame, definitions: GeoDataFrame, join_key: str + ) -> GeoDataFrame: + """Merge definition properties into annotations using the join key. - self.gdf = gpd.GeoDataFrame( - pd.concat([annotations, definitions], ignore_index=True), - geometry="geometry", - crs=self.gdf.crs, - ) + Columns that exist only in the definitions are folded into the result. + Columns that exist in both get a ``_def`` suffix for the definition side. + """ + # Drop all-null columns from annotations so they don't shadow definition values + ann_null_cols = [ + c + for c in annotations.columns + if c != "geometry" and c != join_key and annotations[c].isna().all() + ] + annotations_clean = annotations.drop(columns=ann_null_cols) + + merged = annotations_clean.merge( + definitions.drop(columns=["geometry"]), + on=join_key, + how="left", + suffixes=("", "_def"), + ) + return merged def get_filtered_geodataframe( self, separator: str = "_", **kwargs: str @@ -116,49 +137,6 @@ def get_points(self, **kwargs: str) -> Iterable[Point]: if isinstance(geom, Point): yield geom - def solve_relations(self, join_key: str) -> None: - """Merge properties from non-geometry features into geometry features based on a join key. - - Side effects: - - Non-geometry features (definitions) are permanently removed from self.gdf. - - Annotations without a matching definition receive NaN values for the imported attributes. - - Args: - join_key: The column name used to link non-geometry definitions to geometry features. - """ - is_empty_geom = self.gdf.geometry.isna() | self.gdf.geometry.is_empty - annotations = self.gdf[~is_empty_geom].copy() - - if join_key not in self.gdf.columns: - self.gdf = annotations - return - - definitions = self.gdf[is_empty_geom].copy() - - if definitions.empty or annotations.empty: - self.gdf = annotations - return - - if definitions[join_key].isna().all() or annotations[join_key].isna().all(): - self.gdf = annotations - return - - if definitions[join_key].duplicated().any(): - raise ValueError(f"Duplicate definition for key '{join_key}' found.") - - definitions = definitions.drop(columns=["geometry"], errors="ignore").dropna( - axis=1, how="all" - ) - annotations = annotations.dropna(axis=1, how="all") - - self.gdf = gpd.GeoDataFrame( - annotations.merge( - definitions, on=join_key, how="left", suffixes=("_orig", "_def") - ), - geometry="geometry", - crs=self.gdf.crs, - ) - def safe_to_dict(x: str | Any) -> Any: """Safely converts potential JSON strings to dict, preserving existing dicts and NaNs.""" From 4e450d1423d6725da2eceafca89da3f31805f651 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Mon, 2 Mar 2026 20:16:29 +0100 Subject: [PATCH 21/28] refactor: change init, make solve_relations private a clean up obsolete function --- ratiopath/parsers/geojson_parser.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index bcf73c1..e144e18 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -1,7 +1,6 @@ -import json from collections.abc import Iterable from pathlib import Path -from typing import Any, TextIO +from typing import TextIO import geopandas as gpd from geopandas import GeoDataFrame @@ -96,7 +95,6 @@ def get_filtered_geodataframe( series = filtered_gdf[subkeys[0]] for subkey in subkeys[1:]: - series = series.apply(safe_to_dict) mask = series.apply( lambda x, sk=subkey: isinstance(x, dict) and sk in x ) @@ -136,13 +134,3 @@ def get_points(self, **kwargs: str) -> Iterable[Point]: for geom in filtered_gdf.geometry: if isinstance(geom, Point): yield geom - - -def safe_to_dict(x: str | Any) -> Any: - """Safely converts potential JSON strings to dict, preserving existing dicts and NaNs.""" - if isinstance(x, str): - try: - return json.loads(x) - except (json.JSONDecodeError, TypeError): - return x - return x From 18d34e46a1a21e7ffbe4bac9dbe9af99fb1b582d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Mon, 2 Mar 2026 20:18:52 +0100 Subject: [PATCH 22/28] fix: resubmit uv.lock file --- uv.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/uv.lock b/uv.lock index 5af0930..0a6bade 100644 --- a/uv.lock +++ b/uv.lock @@ -291,7 +291,7 @@ wheels = [ [[package]] name = "geopandas" -version = "1.1.1" +version = "1.1.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy" }, @@ -301,9 +301,9 @@ dependencies = [ { name = "pyproj" }, { name = "shapely" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8c/76/e1960ba846f153ab109575242abf89dc98f8e057faa32f3decf4cce9247a/geopandas-1.1.1.tar.gz", hash = "sha256:1745713f64d095c43e72e08e753dbd271678254b24f2e01db8cdb8debe1d293d", size = 332655, upload-time = "2025-06-26T21:04:56.57Z" } +sdist = { url = "https://files.pythonhosted.org/packages/8d/24/5eb5685d7bf89d64218919379f882d19a60f8219d66d833c83b1cf264c95/geopandas-1.1.2.tar.gz", hash = "sha256:33f7b33565c46a45b8459a2ab699ec943fdbb5716e58e251b3c413cf7783106c", size = 336037, upload-time = "2025-12-22T21:06:13.749Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/70/d5cd0696eff08e62fdbdebe5b46527facb4e7220eabe0ac6225efab50168/geopandas-1.1.1-py3-none-any.whl", hash = "sha256:589e61aaf39b19828843df16cb90234e72897e2579be236f10eee0d052ad98e8", size = 338365, upload-time = "2025-06-26T21:04:55.139Z" }, + { url = "https://files.pythonhosted.org/packages/54/e4/fac19dc34cb686c96011388b813ff7b858a70681e5ce6ce7698e5021b0f4/geopandas-1.1.2-py3-none-any.whl", hash = "sha256:2bb0b1052cb47378addb4ba54c47f8d4642dcbda9b61375638274f49d9f0bb0d", size = 341734, upload-time = "2025-12-22T21:06:12.498Z" }, ] [[package]] @@ -1511,7 +1511,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "albumentations", specifier = ">=2.0.8" }, - { name = "geopandas", specifier = ">=1.1.1" }, + { name = "geopandas", specifier = ">=1.1.2" }, { name = "imagecodecs", specifier = ">=2025.8.2" }, { name = "mkdocs-material", marker = "extra == 'docs'", specifier = ">=9.6.18" }, { name = "mkdocstrings", extras = ["python"], marker = "extra == 'docs'", specifier = ">=0.30.0" }, From 9e85d776f163285e2733117f05e86c5c65728991 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Mon, 2 Mar 2026 20:37:25 +0100 Subject: [PATCH 23/28] fix: revert last changes --- ratiopath/parsers/geojson_parser.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index e144e18..bcf73c1 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -1,6 +1,7 @@ +import json from collections.abc import Iterable from pathlib import Path -from typing import TextIO +from typing import Any, TextIO import geopandas as gpd from geopandas import GeoDataFrame @@ -95,6 +96,7 @@ def get_filtered_geodataframe( series = filtered_gdf[subkeys[0]] for subkey in subkeys[1:]: + series = series.apply(safe_to_dict) mask = series.apply( lambda x, sk=subkey: isinstance(x, dict) and sk in x ) @@ -134,3 +136,13 @@ def get_points(self, **kwargs: str) -> Iterable[Point]: for geom in filtered_gdf.geometry: if isinstance(geom, Point): yield geom + + +def safe_to_dict(x: str | Any) -> Any: + """Safely converts potential JSON strings to dict, preserving existing dicts and NaNs.""" + if isinstance(x, str): + try: + return json.loads(x) + except (json.JSONDecodeError, TypeError): + return x + return x From eeea45fafd1aa6bfd9c9e6e185924021ec214c41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Pek=C3=A1r?= Date: Tue, 3 Mar 2026 10:45:39 +0100 Subject: [PATCH 24/28] Update tests/test_parsers.py --- tests/test_parsers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index dc430ae..492863e 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -216,6 +216,7 @@ def test_solve_relations_missing_join_key(self, geojson_with_relations_content): assert not any(col.endswith("_def") for col in parser.gdf.columns) + class TestDarwin7JSONParser: """Test the Darwin JSON parser.""" From 9a2326335291bbe86c5cdd6ce3e1f05e68e5cd00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Tue, 3 Mar 2026 18:29:00 +0100 Subject: [PATCH 25/28] refactor: use a cleaner semantically equivalent code --- ratiopath/parsers/geojson_parser.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index bcf73c1..28341d7 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -34,19 +34,17 @@ class GeoJSONParser: def __init__( self, file_path: Path | str | TextIO, join_key: str | None = "presetID" ) -> None: - gdf = gpd.read_file(file_path) + self.gdf = gpd.read_file(file_path) - if not gdf.empty: - has_geometry = ~(gdf.geometry.is_empty | gdf.geometry.isna()) - annotations = gdf[has_geometry].explode(index_parts=True) - definitions = gdf[~has_geometry] + if not self.gdf.empty: + has_geometry = ~(self.gdf.geometry.is_empty | self.gdf.geometry.isna()) + annotations = self.gdf[has_geometry].explode(index_parts=True) + definitions = self.gdf[~has_geometry] - if join_key and join_key in gdf.columns and not definitions.empty: + if join_key in self.gdf.columns and not definitions.empty: self.gdf = self._solve_relations(annotations, definitions, join_key) else: self.gdf = annotations - else: - self.gdf = gdf @staticmethod def _solve_relations( From 8622bd63a570061a9d3b692e4af9d7a8307c4c5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Tue, 3 Mar 2026 18:37:46 +0100 Subject: [PATCH 26/28] fix: add explicit type guard for join_key to satisfy mypy --- ratiopath/parsers/geojson_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 28341d7..065ef0d 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -41,7 +41,7 @@ def __init__( annotations = self.gdf[has_geometry].explode(index_parts=True) definitions = self.gdf[~has_geometry] - if join_key in self.gdf.columns and not definitions.empty: + if join_key is not None in self.gdf.columns and not definitions.empty: self.gdf = self._solve_relations(annotations, definitions, join_key) else: self.gdf = annotations From 0baa9700c3b9f54e0c7f11eb1c5ef9b626885682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Tue, 3 Mar 2026 18:48:47 +0100 Subject: [PATCH 27/28] fix: disable mypy --- ratiopath/parsers/geojson_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index 065ef0d..e8b8f0f 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -41,8 +41,8 @@ def __init__( annotations = self.gdf[has_geometry].explode(index_parts=True) definitions = self.gdf[~has_geometry] - if join_key is not None in self.gdf.columns and not definitions.empty: - self.gdf = self._solve_relations(annotations, definitions, join_key) + if join_key in self.gdf.columns and not definitions.empty: + self.gdf = self._solve_relations(annotations, definitions, join_key) # type: ignore[arg-type] else: self.gdf = annotations From ce1aa570cd5ccbd37d25bc9c4296993b3199e53d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Tue, 3 Mar 2026 18:49:13 +0100 Subject: [PATCH 28/28] fix: format --- ratiopath/parsers/geojson_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ratiopath/parsers/geojson_parser.py b/ratiopath/parsers/geojson_parser.py index e8b8f0f..1c75ad2 100644 --- a/ratiopath/parsers/geojson_parser.py +++ b/ratiopath/parsers/geojson_parser.py @@ -42,7 +42,7 @@ def __init__( definitions = self.gdf[~has_geometry] if join_key in self.gdf.columns and not definitions.empty: - self.gdf = self._solve_relations(annotations, definitions, join_key) # type: ignore[arg-type] + self.gdf = self._solve_relations(annotations, definitions, join_key) # type: ignore[arg-type] else: self.gdf = annotations