From 7e4293cd9661804af2acea6d315ce8732fe34a7c Mon Sep 17 00:00:00 2001 From: Manuel Bellersen Date: Wed, 20 Aug 2025 17:41:53 +0200 Subject: [PATCH] GH-53: refactor: Centralize relative dataframe-information-path and Migrate from pydantic dataclasses to BaseModel This commit refactors all data models from `pydantic.dataclasses.dataclass` (and, for `Project` and `ProjectInformation`, the standard-library `dataclasses.dataclass`) to `pydantic.BaseModel`. This change provides a more idiomatic and powerful way to use Pydantic's features, such as `model_config`, `model_dump_json()` and `model_validate()`. Key changes include: - Replaced `@dataclass(frozen=True)` with `BaseModel` and `model_config = ConfigDict(frozen=True)`. Models that were previously mutable (`ColumnProfile`, `DataFrameInformation`, `ContainerInformation`, `PipelineInformation`, `Project`, `ProjectInformation`) are now frozen as well. - Simplified serialization/deserialization logic by replacing `TypeAdapter` with direct `Model.model_dump_json()` and `Model.model_validate()` calls. - Centralized path relativization logic for `DataFrameInformation` into a new `serialize/dataframe_information.py` module, which replaces the deleted `metadata/utils.py`. This removes the need for custom `_serialize_model` methods and `path` fields on `ContainerInformation` and `PipelineInformation`, cleaning up the model definitions. - Renamed `ROLE_TYPE_ADAPTER_MAPPING` to `ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING`; it now stores model types instead of `TypeAdapter` instances, simplifying validation logic. - Removed obsolete type alias files (`feature_sets.py`, `prediction_results.py`).
--- src/getml_io/cli.py | 2 +- src/getml_io/getml/feature_learning.py | 30 +++-- src/getml_io/getml/predictors.py | 35 ++--- src/getml_io/getml/preprocessors.py | 40 +++--- src/getml_io/getml/project.py | 11 +- src/getml_io/getml/project_information.py | 9 +- src/getml_io/getml/roles.py | 8 +- .../metadata/container_information.py | 62 ++------- .../metadata/data_model_information.py | 8 +- .../metadata/dataframe_information.py | 58 ++++----- src/getml_io/metadata/feature_sets.py | 6 - src/getml_io/metadata/pipeline_information.py | 82 +----------- .../metadata/placeholder_information.py | 13 +- src/getml_io/metadata/prediction_results.py | 6 - src/getml_io/metadata/utils.py | 38 ------ src/getml_io/serialize/container.py | 35 +++-- .../serialize/container_information.py | 8 +- .../serialize/dataframe_information.py | 61 +++++++++ src/getml_io/serialize/dataframe_or_view.py | 37 ++---- src/getml_io/serialize/exception.py | 6 +- src/getml_io/serialize/pipeline.py | 78 ++++++------ .../serialize/pipeline_information.py | 8 +- src/getml_io/serialize/project.py | 4 +- tests/integration/assertions.py | 2 - tests/integration/data/cora/cora.py | 2 - tests/integration/data/datasets.py | 8 +- tests/integration/data/getmlproject.py | 13 +- tests/integration/data/loans/loans.py | 2 - tests/integration/data/numerical/numerical.py | 2 - tests/integration/data/robot/robot.py | 8 +- tests/integration/helpers.py | 18 +-- tests/unit/conftest.py | 120 +++++++++++------- tests/unit/getml/test_project.py | 6 +- .../metadata/test_container_information.py | 34 +++-- .../metadata/test_pipeline_information.py | 9 +- tests/unit/serialize/test_container.py | 66 ++++++---- .../serialize/test_container_information.py | 12 +- .../test_dataframe_information.py} | 24 +++- .../unit/serialize/test_dataframe_or_view.py | 7 +- tests/unit/serialize/test_pipeline.py | 63 +++++---- .../serialize/test_pipeline_information.py | 8 +- tests/unit/serialize/test_project.py | 11 +- 42 files changed, 525 
insertions(+), 535 deletions(-) delete mode 100644 src/getml_io/metadata/feature_sets.py delete mode 100644 src/getml_io/metadata/prediction_results.py delete mode 100644 src/getml_io/metadata/utils.py create mode 100644 src/getml_io/serialize/dataframe_information.py rename tests/unit/{metadata/test_utils.py => serialize/test_dataframe_information.py} (62%) diff --git a/src/getml_io/cli.py b/src/getml_io/cli.py index 9f860a4..97fb473 100644 --- a/src/getml_io/cli.py +++ b/src/getml_io/cli.py @@ -153,7 +153,7 @@ def deserialize( pipeline_id=pipeline, container_id=container, ) - message = f"Deserializing {project_information} from {root_storage_directory}" + message = f"Deserializing {project_information!r} from {root_storage_directory}" # TODO @urfoex: #20: Implement deserialization logic raise NotImplementedError(message) diff --git a/src/getml_io/getml/feature_learning.py b/src/getml_io/getml/feature_learning.py index f45d113..329d9aa 100644 --- a/src/getml_io/getml/feature_learning.py +++ b/src/getml_io/getml/feature_learning.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections.abc import Set as AbstractSet -from typing import Annotated, Literal +from typing import Annotated, ClassVar, Literal from getml.feature_learning.aggregations.types import ( FastPropAggregations, @@ -11,12 +11,12 @@ CrossEntropyLossType, SquareLossType, ) -from pydantic import Field -from pydantic.dataclasses import dataclass +from pydantic import BaseModel, ConfigDict, Field -@dataclass(frozen=True) -class FastProp: +class FastProp(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + aggregation: AbstractSet[FastPropAggregations] delta_t: float loss_function: CrossEntropyLossType | SquareLossType | None @@ -31,8 +31,9 @@ class FastProp: type: Literal["fast_prop"] = "fast_prop" -@dataclass(frozen=True) -class Fastboost: +class Fastboost(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + gamma: float loss_function: 
CrossEntropyLossType | SquareLossType | None max_depth: int @@ -47,8 +48,9 @@ class Fastboost: type: Literal["fastboost"] = "fastboost" -@dataclass(frozen=True) -class Multirel: +class Multirel(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + aggregation: AbstractSet[MultirelAggregations] allow_sets: bool delta_t: float @@ -73,8 +75,9 @@ class Multirel: type: Literal["multirel"] = "multirel" -@dataclass(frozen=True) -class Relboost: +class Relboost(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + allow_null_weights: bool delta_t: float gamma: float @@ -95,8 +98,9 @@ class Relboost: type: Literal["relboost"] = "relboost" -@dataclass(frozen=True) -class RelMT: +class RelMT(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + allow_avg: bool delta_t: float gamma: float diff --git a/src/getml_io/getml/predictors.py b/src/getml_io/getml/predictors.py index 662cf2b..a557ad1 100644 --- a/src/getml_io/getml/predictors.py +++ b/src/getml_io/getml/predictors.py @@ -1,27 +1,29 @@ from __future__ import annotations -from typing import Annotated, Literal +from typing import Annotated, ClassVar, Literal -from pydantic import Field -from pydantic.dataclasses import dataclass +from pydantic import BaseModel, ConfigDict, Field -@dataclass(frozen=True) -class LinearRegression: +class LinearRegression(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + learning_rate: float reg_lambda: float type: Literal["linear_regression"] = "linear_regression" -@dataclass(frozen=True) -class LogisticRegression: +class LogisticRegression(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + learning_rate: float reg_lambda: float type: Literal["logistic_regression"] = "logistic_regression" -@dataclass(frozen=True) -class ScaleGBMClassifier: +class ScaleGBMClassifier(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + colsample_bylevel: float 
colsample_bytree: float early_stopping_rounds: int @@ -39,8 +41,9 @@ class ScaleGBMClassifier: type: Literal["scale_gbm_classifier"] = "scale_gbm_classifier" -@dataclass(frozen=True) -class ScaleGBMRegressor: +class ScaleGBMRegressor(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + colsample_bylevel: float colsample_bytree: float early_stopping_rounds: int @@ -58,8 +61,9 @@ class ScaleGBMRegressor: type: Literal["scale_gbm_regressor"] = "scale_gbm_regressor" -@dataclass(frozen=True) -class XGBoostClassifier: +class XGBoostClassifier(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + booster: str colsample_bylevel: float colsample_bytree: float @@ -86,8 +90,9 @@ class XGBoostClassifier: type: Literal["xgboost_classifier"] = "xgboost_classifier" -@dataclass(frozen=True) -class XGBoostRegressor: +class XGBoostRegressor(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + booster: str colsample_bylevel: float colsample_bytree: float diff --git a/src/getml_io/getml/preprocessors.py b/src/getml_io/getml/preprocessors.py index 09db575..56e9bc8 100644 --- a/src/getml_io/getml/preprocessors.py +++ b/src/getml_io/getml/preprocessors.py @@ -1,41 +1,45 @@ from __future__ import annotations from collections.abc import Set as AbstractSet -from typing import Annotated, Literal +from typing import Annotated, ClassVar, Literal from getml.feature_learning.aggregations.types import MappingAggregations -from pydantic import Field -from pydantic.dataclasses import dataclass +from pydantic import BaseModel, ConfigDict, Field -@dataclass(frozen=True) -class CategoryTrimmer: +class CategoryTrimmer(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + max_num_categories: int min_freq: int type: Literal["category_trimmer"] = "category_trimmer" -@dataclass(frozen=True) -class EmailDomain: +class EmailDomain(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + type: 
Literal["email_domain"] = "email_domain" -@dataclass(frozen=True) -class Imputation: +class Imputation(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + add_dummies: bool type: Literal["imputation"] = "imputation" -@dataclass(frozen=True) -class Mapping: +class Mapping(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + aggregation: AbstractSet[MappingAggregations] min_freq: int multithreading: bool type: Literal["mapping"] = "mapping" -@dataclass(frozen=True) -class Seasonal: +class Seasonal(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + disable_year: bool disable_month: bool disable_weekday: bool @@ -44,16 +48,18 @@ class Seasonal: type: Literal["seasonal"] = "seasonal" -@dataclass(frozen=True) -class Substring: +class Substring(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + begin: int length: int unit: str type: Literal["substring"] = "substring" -@dataclass(frozen=True) -class TextFieldSplitter: +class TextFieldSplitter(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + type: Literal["text_field_splitter"] = "text_field_splitter" diff --git a/src/getml_io/getml/project.py b/src/getml_io/getml/project.py index da885d1..5a336b9 100644 --- a/src/getml_io/getml/project.py +++ b/src/getml_io/getml/project.py @@ -1,10 +1,11 @@ import logging from collections.abc import Generator from contextlib import contextmanager -from dataclasses import dataclass +from typing import ClassVar from getml.data import Container from getml.pipeline import Pipeline +from pydantic import BaseModel, ConfigDict from getml_io.getml.exception import ( PipelineNotFoundError, @@ -24,8 +25,12 @@ logger: logging.Logger = logging.getLogger(__name__) -@dataclass -class Project: +class Project(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + name: str pipeline: Pipeline container: Container diff 
--git a/src/getml_io/getml/project_information.py b/src/getml_io/getml/project_information.py index f0ae3da..cd45415 100644 --- a/src/getml_io/getml/project_information.py +++ b/src/getml_io/getml/project_information.py @@ -1,8 +1,11 @@ -from dataclasses import dataclass +from typing import ClassVar +from pydantic import BaseModel, ConfigDict + + +class ProjectInformation(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) -@dataclass -class ProjectInformation: project_name: str pipeline_id: str container_id: str diff --git a/src/getml_io/getml/roles.py b/src/getml_io/getml/roles.py index 44d6bce..36d4e2f 100644 --- a/src/getml_io/getml/roles.py +++ b/src/getml_io/getml/roles.py @@ -2,9 +2,10 @@ from collections.abc import Sequence from enum import Enum +from typing import ClassVar from getml.data import roles -from pydantic.dataclasses import dataclass +from pydantic import BaseModel, ConfigDict class Role(str, Enum): @@ -18,8 +19,9 @@ class Role(str, Enum): UNUSED_STRING = roles.unused_string -@dataclass(frozen=True) -class Roles: +class Roles(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + categorical: Sequence[str] join_key: Sequence[str] numerical: Sequence[str] diff --git a/src/getml_io/metadata/container_information.py b/src/getml_io/metadata/container_information.py index 9a80835..6fbc64b 100644 --- a/src/getml_io/metadata/container_information.py +++ b/src/getml_io/metadata/container_information.py @@ -1,62 +1,20 @@ from __future__ import annotations -from collections.abc import Mapping -from pathlib import Path -from typing import Annotated +from typing import ClassVar -from pydantic import Field, model_serializer -from pydantic.dataclasses import dataclass -from typing_extensions import TypedDict +from pydantic import BaseModel, ConfigDict -from getml_io.metadata.dataframe_information import DataFrameInformation -from getml_io.metadata.utils import derive_instance_with_relative_path +from 
getml_io.metadata.dataframe_information import ( + DataFrameInformation, + DataFrameInformationByName, +) -class ContainerInformationDict(TypedDict): - id: str - population: DataFrameInformation | None - peripheral: Mapping[str, DataFrameInformation] - subsets: Mapping[str, DataFrameInformation] - deep_copy: bool +class ContainerInformation(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) - -@dataclass -class ContainerInformation: id: str population: DataFrameInformation | None - peripheral: Mapping[str, DataFrameInformation] - subsets: Mapping[str, DataFrameInformation] + peripheral: DataFrameInformationByName + subsets: DataFrameInformationByName deep_copy: bool - path: Annotated[Path, Field(exclude=True)] - - # TODO @urfoex: #53 Adjust relative path already in serialization function - @model_serializer() - def _serialize_model( - self, - ) -> ContainerInformationDict: - return { - "id": self.id, - "population": self._create_dataframe_information_with_relative_path( - self.population, - ) - if self.population - else None, - "peripheral": { - name: self._create_dataframe_information_with_relative_path(peripheral) - for name, peripheral in self.peripheral.items() - }, - "subsets": { - name: self._create_dataframe_information_with_relative_path(subset) - for name, subset in self.subsets.items() - }, - "deep_copy": self.deep_copy, - } - - def _create_dataframe_information_with_relative_path( - self, - dataframe_information: DataFrameInformation, - ) -> DataFrameInformation: - return derive_instance_with_relative_path( - dataframe_information, - self.path, - ) diff --git a/src/getml_io/metadata/data_model_information.py b/src/getml_io/metadata/data_model_information.py index 25ef133..903d494 100644 --- a/src/getml_io/metadata/data_model_information.py +++ b/src/getml_io/metadata/data_model_information.py @@ -1,13 +1,15 @@ from __future__ import annotations from collections.abc import Mapping, Sequence +from typing import ClassVar -from 
pydantic.dataclasses import dataclass +from pydantic import BaseModel, ConfigDict from getml_io.metadata.placeholder_information import PlaceholderInformation -@dataclass(frozen=True) -class DataModelInformation: +class DataModelInformation(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + population: PlaceholderInformation peripheral: Mapping[str, Sequence[PlaceholderInformation]] diff --git a/src/getml_io/metadata/dataframe_information.py b/src/getml_io/metadata/dataframe_information.py index e82bd7f..5fa952c 100644 --- a/src/getml_io/metadata/dataframe_information.py +++ b/src/getml_io/metadata/dataframe_information.py @@ -3,16 +3,16 @@ from collections.abc import Mapping from datetime import datetime from pathlib import Path -from typing import Annotated, Literal +from typing import Annotated, ClassVar, Literal -from pydantic import Field, TypeAdapter -from pydantic.dataclasses import dataclass +from pydantic import BaseModel, ConfigDict, Field from getml_io.getml.roles import Role -@dataclass(frozen=True) -class ColumnStatisticsDouble: +class ColumnStatisticsDouble(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + count: int approx_unique: int avg: float @@ -26,8 +26,9 @@ class ColumnStatisticsDouble: column_type: Literal["DOUBLE"] -@dataclass(frozen=True) -class ColumnStatisticsVarchar: +class ColumnStatisticsVarchar(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + count: int approx_unique: int min: str @@ -36,18 +37,17 @@ class ColumnStatisticsVarchar: column_type: Literal["VARCHAR"] -@dataclass(frozen=True) class ColumnStatisticsNumerical(ColumnStatisticsDouble): type: Literal["numerical"] = "numerical" -@dataclass(frozen=True) class ColumnStatisticsTarget(ColumnStatisticsDouble): type: Literal["target"] = "target" -@dataclass(frozen=True) -class ColumnStatisticsTimeStamp: +class ColumnStatisticsTimeStamp(BaseModel): + model_config: ClassVar[ConfigDict] = 
ConfigDict(frozen=True) + count: int approx_unique: int avg: datetime @@ -61,27 +61,22 @@ class ColumnStatisticsTimeStamp: type: Literal["time_stamp"] = "time_stamp" -@dataclass(frozen=True) class ColumnStatisticsTimeStampAsFloat(ColumnStatisticsDouble): type: Literal["time_stamp_float"] = "time_stamp_float" -@dataclass(frozen=True) class ColumnStatisticsCategorical(ColumnStatisticsVarchar): type: Literal["categorical"] = "categorical" -@dataclass(frozen=True) class ColumnStatisticsJoinKey(ColumnStatisticsVarchar): type: Literal["join_key"] = "join_key" -@dataclass(frozen=True) class ColumnStatisticsUnusedFloat(ColumnStatisticsDouble): type: Literal["unused_float"] = "unused_float" -@dataclass(frozen=True) class ColumnStatisticsUnusedString(ColumnStatisticsVarchar): type: Literal["unused_string"] = "unused_string" @@ -99,27 +94,32 @@ class ColumnStatisticsUnusedString(ColumnStatisticsVarchar): ] -@dataclass -class ColumnProfile: +class ColumnProfile(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + name: str role: Role statistics: ColumnStatistics -@dataclass -class DataFrameInformation: +class DataFrameInformation(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + name: str path: Path column_profile: Mapping[str, ColumnProfile] -ROLE_TYPE_ADAPTER_MAPPING = { - (Role.CATEGORICAL, "VARCHAR"): TypeAdapter(ColumnStatisticsCategorical), - (Role.JOIN_KEY, "VARCHAR"): TypeAdapter(ColumnStatisticsJoinKey), - (Role.NUMERICAL, "DOUBLE"): TypeAdapter(ColumnStatisticsNumerical), - (Role.TARGET, "DOUBLE"): TypeAdapter(ColumnStatisticsTarget), - (Role.TIME_STAMP, "TIMESTAMP_NS"): TypeAdapter(ColumnStatisticsTimeStamp), - (Role.TIME_STAMP, "DOUBLE"): TypeAdapter(ColumnStatisticsTimeStampAsFloat), - (Role.UNUSED_FLOAT, "DOUBLE"): TypeAdapter(ColumnStatisticsUnusedFloat), - (Role.UNUSED_STRING, "VARCHAR"): TypeAdapter(ColumnStatisticsUnusedString), +DataFrameInformationByName = Mapping[str, DataFrameInformation] + + 
+ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING = { + (Role.CATEGORICAL, "VARCHAR"): ColumnStatisticsCategorical, + (Role.JOIN_KEY, "VARCHAR"): ColumnStatisticsJoinKey, + (Role.NUMERICAL, "DOUBLE"): ColumnStatisticsNumerical, + (Role.TARGET, "DOUBLE"): ColumnStatisticsTarget, + (Role.TIME_STAMP, "TIMESTAMP_NS"): ColumnStatisticsTimeStamp, + (Role.TIME_STAMP, "DOUBLE"): ColumnStatisticsTimeStampAsFloat, + (Role.UNUSED_FLOAT, "DOUBLE"): ColumnStatisticsUnusedFloat, + (Role.UNUSED_STRING, "VARCHAR"): ColumnStatisticsUnusedString, } diff --git a/src/getml_io/metadata/feature_sets.py b/src/getml_io/metadata/feature_sets.py deleted file mode 100644 index cb315e2..0000000 --- a/src/getml_io/metadata/feature_sets.py +++ /dev/null @@ -1,6 +0,0 @@ -from collections.abc import Mapping -from typing import TypeAlias - -from getml_io.metadata.dataframe_information import DataFrameInformation - -FeatureSets: TypeAlias = Mapping[str, DataFrameInformation] diff --git a/src/getml_io/metadata/pipeline_information.py b/src/getml_io/metadata/pipeline_information.py index 541d109..fc6682c 100644 --- a/src/getml_io/metadata/pipeline_information.py +++ b/src/getml_io/metadata/pipeline_information.py @@ -2,28 +2,20 @@ from collections.abc import Sequence from enum import Enum -from pathlib import Path -from typing import Annotated +from typing import ClassVar from getml.feature_learning.loss_functions import ( CROSSENTROPYLOSS, SQUARELOSS, ) -from pydantic import ( - Field, - model_serializer, -) -from pydantic.dataclasses import dataclass -from typing_extensions import TypedDict +from pydantic import BaseModel, ConfigDict from getml_io.getml.feature_learning import FeatureLearner from getml_io.getml.predictors import FeatureSelector, Predictor from getml_io.getml.preprocessors import Preprocessor from getml_io.metadata.data_model_information import DataModelInformation -from getml_io.metadata.feature_sets import FeatureSets +from getml_io.metadata.dataframe_information import 
DataFrameInformationByName from getml_io.metadata.placeholder_information import PlaceholderInformation -from getml_io.metadata.prediction_results import PredictionResults -from getml_io.metadata.utils import derive_instance_with_relative_path class LossFunction(str, Enum): @@ -31,36 +23,12 @@ class LossFunction(str, Enum): SQUARE_LOSS = SQUARELOSS -class PipelineInformationDict(TypedDict): - id: str - predictions: PredictionResults - feature_sets: FeatureSets - feature_learners: Sequence[FeatureLearner] - feature_selectors: Sequence[FeatureSelector] - include_categorical: bool - is_classification: bool - is_regression: bool - loss_function: LossFunction - peripheral: Sequence[PlaceholderInformation] - predictors: Sequence[Predictor] - preprocessors: Sequence[Preprocessor] - share_selected_features: float - tags: Sequence[str] - targets: Sequence[str] - data_model: DataModelInformation - # features # TODO @urfoex: #17 - # scores # TODO @urfoex: #18 - # columns # TODO @urfoex: #50 - # metadata # TODO @urfoex: #51 - # tables # TODO @urfoex: #52 - +class PipelineInformation(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) -@dataclass -class PipelineInformation: id: str - predictions: PredictionResults - feature_sets: FeatureSets - path: Annotated[Path, Field(exclude=True)] + predictions: DataFrameInformationByName + feature_sets: DataFrameInformationByName feature_learners: Sequence[FeatureLearner] feature_selectors: Sequence[FeatureSelector] include_categorical: bool @@ -79,39 +47,3 @@ class PipelineInformation: # columns # TODO @urfoex: #50 # metadata # TODO @urfoex: #51 # tables # TODO @urfoex: #52 - - # TODO @urfoex: #53 Adjust relative path already in serialization function - @model_serializer() - def _serialize_model(self) -> PipelineInformationDict: - return PipelineInformationDict( - { - "id": self.id, - "predictions": { - name: derive_instance_with_relative_path( - dataframe_information, - self.path, - ) - for name, 
dataframe_information in self.predictions.items() - }, - "feature_sets": { - name: derive_instance_with_relative_path( - dataframe_information, - self.path, - ) - for name, dataframe_information in self.feature_sets.items() - }, - "feature_learners": self.feature_learners, - "feature_selectors": self.feature_selectors, - "include_categorical": self.include_categorical, - "is_classification": self.is_classification, - "is_regression": self.is_regression, - "loss_function": self.loss_function, - "peripheral": self.peripheral, - "predictors": self.predictors, - "preprocessors": self.preprocessors, - "share_selected_features": self.share_selected_features, - "tags": self.tags, - "targets": self.targets, - "data_model": self.data_model, - }, - ) diff --git a/src/getml_io/metadata/placeholder_information.py b/src/getml_io/metadata/placeholder_information.py index 32fea54..735f4b2 100644 --- a/src/getml_io/metadata/placeholder_information.py +++ b/src/getml_io/metadata/placeholder_information.py @@ -1,15 +1,17 @@ from __future__ import annotations from collections.abc import Sequence +from typing import ClassVar -from pydantic.dataclasses import dataclass +from pydantic import BaseModel, ConfigDict from getml_io.getml.relationships import Relationship from getml_io.getml.roles import Roles -@dataclass(frozen=True) -class JoinInformation: +class JoinInformation(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + right: PlaceholderInformation on: Sequence[tuple[str, str]] | Sequence[tuple[None, None]] time_stamps: str | tuple[str, str] | None @@ -20,8 +22,9 @@ class JoinInformation: lagged_targets: bool | None -@dataclass(frozen=True) -class PlaceholderInformation: +class PlaceholderInformation(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + name: str roles: Roles joins: Sequence[JoinInformation] diff --git a/src/getml_io/metadata/prediction_results.py b/src/getml_io/metadata/prediction_results.py deleted file mode 
100644 index 4debb21..0000000 --- a/src/getml_io/metadata/prediction_results.py +++ /dev/null @@ -1,6 +0,0 @@ -from collections.abc import Mapping -from typing import TypeAlias - -from getml_io.metadata.dataframe_information import DataFrameInformation - -PredictionResults: TypeAlias = Mapping[str, DataFrameInformation] diff --git a/src/getml_io/metadata/utils.py b/src/getml_io/metadata/utils.py deleted file mode 100644 index 3cbdc96..0000000 --- a/src/getml_io/metadata/utils.py +++ /dev/null @@ -1,38 +0,0 @@ -import dataclasses -from pathlib import Path - -from getml_io.metadata.dataframe_information import DataFrameInformation -from getml_io.metadata.exception import ( - DataFrameInformationPathNotRelativeError, -) - - -def derive_instance_with_relative_path( - dataframe_information: DataFrameInformation, - base_path: Path, -) -> DataFrameInformation: - """Derive a copy of an instance with a path relative to the given base path. - - Args: - dataframe_information: The instance to use as a template. - base_path: The base path to which the instance's path should be relative. - - Returns: - A new instance with the path relative to the base path. - - Raises: - DataFrameInformationPathNotRelativeError: If the instance's path cannot be made - relative to the base path. 
- - """ - try: - return dataclasses.replace( - dataframe_information, - path=dataframe_information.path.relative_to(base_path), - ) - except Exception as exception: - raise DataFrameInformationPathNotRelativeError( - dataframe_information.name, - dataframe_information.path, - base_path, - ) from exception diff --git a/src/getml_io/serialize/container.py b/src/getml_io/serialize/container.py index c2e576b..0af2bd8 100644 --- a/src/getml_io/serialize/container.py +++ b/src/getml_io/serialize/container.py @@ -7,8 +7,15 @@ from getml.data import Container from getml_io.metadata.container_information import ContainerInformation -from getml_io.metadata.dataframe_information import DataFrameInformation +from getml_io.metadata.dataframe_information import ( + DataFrameInformation, + DataFrameInformationByName, +) from getml_io.serialize.container_information import serialize_container_information +from getml_io.serialize.dataframe_information import ( + derive_instance_with_relative_path, + derive_instances_with_relative_path, +) from getml_io.serialize.dataframe_or_view import serialize_dataframe_or_view from getml_io.utils.convert import ( assume_is_bool, @@ -52,11 +59,21 @@ def serialize_container( ) container_information = ContainerInformation( id=assume_is_str(container.id), - population=population_information, - peripheral=peripheral_information, - subsets=subsets_information, + population=derive_instance_with_relative_path( + population_information, + target_storage_directory, + ) + if population_information + else None, + peripheral=derive_instances_with_relative_path( + peripheral_information, + target_storage_directory, + ), + subsets=derive_instances_with_relative_path( + subsets_information, + target_storage_directory, + ), deep_copy=assume_is_bool(container.deep_copy), - path=target_storage_directory, ) container_information_json_path = serialize_container_information( @@ -95,7 +112,7 @@ def serialize_population( def serialize_peripheral( container: Container, 
container_storage_directory: Path, -) -> dict[str, DataFrameInformation]: +) -> DataFrameInformationByName: """Serialize the peripherals of a container into the container storage directory. Args: @@ -104,7 +121,7 @@ def serialize_peripheral( will be saved. Returns: - dict[str, DataFrameInformation]: A dictionary mapping peripheral names to + DataFrameInformationByName: A dictionary mapping peripheral names to their serialized DataFrameInformation. """ @@ -123,7 +140,7 @@ def serialize_peripheral( def serialize_subsets( container: Container, container_storage_directory: Path, -) -> dict[str, DataFrameInformation]: +) -> DataFrameInformationByName: """Serialize the subsets of a container into the container storage directory. Args: @@ -132,7 +149,7 @@ def serialize_subsets( will be saved. Returns: - dict[str, DataFrameInformation]: A dictionary mapping subset names to + DataFrameInformationByName: A dictionary mapping subset names to their serialized DataFrameInformation. """ diff --git a/src/getml_io/serialize/container_information.py b/src/getml_io/serialize/container_information.py index 9f6a2b5..adee2f8 100644 --- a/src/getml_io/serialize/container_information.py +++ b/src/getml_io/serialize/container_information.py @@ -2,8 +2,6 @@ from pathlib import Path -from pydantic import TypeAdapter - from getml_io.metadata.container_information import ContainerInformation from getml_io.serialize.exception import ( ContainerInformationSerializationError, @@ -31,7 +29,7 @@ def serialize_container_information( """ try: - container_information_json = TypeAdapter(ContainerInformation).dump_json( + container_information_json = ContainerInformation.model_dump_json( container_information, indent=2, ) @@ -42,9 +40,7 @@ def serialize_container_information( container_json_path = target_storage_directory / "container.json" try: - _ = container_json_path.write_bytes( - container_information_json, - ) + _ = container_json_path.write_text(container_information_json) except Exception as 
exception: raise ContainerInformationStorageError( container_information.id, diff --git a/src/getml_io/serialize/dataframe_information.py b/src/getml_io/serialize/dataframe_information.py new file mode 100644 index 0000000..5e35592 --- /dev/null +++ b/src/getml_io/serialize/dataframe_information.py @@ -0,0 +1,61 @@ +from pathlib import Path + +from getml_io.metadata.dataframe_information import ( + DataFrameInformation, + DataFrameInformationByName, +) +from getml_io.metadata.exception import DataFrameInformationPathNotRelativeError + + +def derive_instance_with_relative_path( + dataframe_information: DataFrameInformation, + base_path: Path, +) -> DataFrameInformation: + """Derive a copy of an instance with a path relative to the given base path. + + Args: + dataframe_information: The instance to use as a template. + base_path: The base path to which the instance's path should be relative. + + Returns: + A new instance with the path relative to the base path. + + Raises: + DataFrameInformationPathNotRelativeError: If the instance's path cannot be made + relative to the base path. + + """ + try: + return dataframe_information.model_copy( + update={ + "path": dataframe_information.path.relative_to(base_path), + }, + ) + except Exception as exception: + raise DataFrameInformationPathNotRelativeError( + dataframe_information.name, + dataframe_information.path, + base_path, + ) from exception + + +def derive_instances_with_relative_path( + dataframe_information_by_name: DataFrameInformationByName, + base_path: Path, +) -> DataFrameInformationByName: + """Derive copies of instances with paths relative to the given base path. + + Args: + dataframe_information_by_name: A dictionary mapping names to + DataFrameInformation instances. + base_path: The base path to which the instances' paths should be relative. + + Returns: + A new dictionary with the same names, but with DataFrameInformation instances + having paths relative to the base path. 
+ + """ + return { + name: derive_instance_with_relative_path(dataframe_information, base_path) + for name, dataframe_information in dataframe_information_by_name.items() + } diff --git a/src/getml_io/serialize/dataframe_or_view.py b/src/getml_io/serialize/dataframe_or_view.py index c84da94..3b31488 100644 --- a/src/getml_io/serialize/dataframe_or_view.py +++ b/src/getml_io/serialize/dataframe_or_view.py @@ -11,21 +11,12 @@ DataFrame, View, ) -from pydantic import TypeAdapter from getml_io.getml.roles import Role from getml_io.metadata.dataframe_information import ( - ROLE_TYPE_ADAPTER_MAPPING, + ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING, ColumnProfile, ColumnStatistics, - ColumnStatisticsCategorical, - ColumnStatisticsJoinKey, - ColumnStatisticsNumerical, - ColumnStatisticsTarget, - ColumnStatisticsTimeStamp, - ColumnStatisticsTimeStampAsFloat, - ColumnStatisticsUnusedFloat, - ColumnStatisticsUnusedString, DataFrameInformation, ) from getml_io.serialize.exception import ( @@ -157,38 +148,32 @@ def _build_column_statistics( raw_summary_statistics: Mapping[str, Mapping[str, str | int | float]], ) -> dict[str, ColumnStatistics]: return { - name: _get_column_statistics_adapter( + name: _get_column_statistics_type( dataframe_or_view, name, assume_is_str(raw_summary_statistics[name]["column_type"]), - ).validate_python( + ).model_validate( raw_summary_statistics[name], ) for name in dataframe_or_view.columns } -def _get_column_statistics_adapter( +def _get_column_statistics_type( dataframe_or_view: DataFrame | View, name: str, column_type: str, -) -> ( - TypeAdapter[ColumnStatisticsNumerical] - | TypeAdapter[ColumnStatisticsTarget] - | TypeAdapter[ColumnStatisticsCategorical] - | TypeAdapter[ColumnStatisticsJoinKey] - | TypeAdapter[ColumnStatisticsTimeStamp] - | TypeAdapter[ColumnStatisticsTimeStampAsFloat] - | TypeAdapter[ColumnStatisticsUnusedFloat] - | TypeAdapter[ColumnStatisticsUnusedString] -): +) -> type[ColumnStatistics]: role = 
serialize_role(dataframe_or_view.roles.column(name)) - adapter = ROLE_TYPE_ADAPTER_MAPPING.get((role, column_type)) - if adapter is None: + column_statistics_type = ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING.get(( + role, + column_type, + )) + if column_statistics_type is None: raise UnsupportedColumnStatisticsError( assume_is_str(dataframe_or_view.name), name, role, column_type, ) - return adapter + return column_statistics_type diff --git a/src/getml_io/serialize/exception.py b/src/getml_io/serialize/exception.py index e61bdce..928e51e 100644 --- a/src/getml_io/serialize/exception.py +++ b/src/getml_io/serialize/exception.py @@ -1,7 +1,9 @@ from pathlib import Path from getml_io.getml.roles import Role -from getml_io.metadata.dataframe_information import ROLE_TYPE_ADAPTER_MAPPING +from getml_io.metadata.dataframe_information import ( + ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING, +) from getml_io.utils.exception import GetMLIOError @@ -86,6 +88,6 @@ def __init__( message = ( f"Column {column_name!r} in dataframe {dataframe_name!r} has an " f"unsupported role: {role!r} and type: {column_type}. " - f"Supported are: {list(ROLE_TYPE_ADAPTER_MAPPING.keys())}." + f"Supported are: {list(ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING.keys())}." 
) super().__init__(message) diff --git a/src/getml_io/serialize/pipeline.py b/src/getml_io/serialize/pipeline.py index 4a44146..89e0c49 100644 --- a/src/getml_io/serialize/pipeline.py +++ b/src/getml_io/serialize/pipeline.py @@ -13,7 +13,6 @@ ) from getml.pipeline import Pipeline from numpy.typing import NDArray -from pydantic import TypeAdapter from getml_io.getml.feature_learning import ( Fastboost, @@ -42,13 +41,13 @@ Substring, TextFieldSplitter, ) -from getml_io.metadata.feature_sets import FeatureSets +from getml_io.metadata.dataframe_information import DataFrameInformationByName from getml_io.metadata.pipeline_information import ( LossFunction, PipelineInformation, ) -from getml_io.metadata.prediction_results import PredictionResults from getml_io.serialize.data_model import serialize_data_model +from getml_io.serialize.dataframe_information import derive_instances_with_relative_path from getml_io.serialize.dataframe_or_view import serialize_dataframe_or_view from getml_io.serialize.pipeline_information import serialize_pipeline_information from getml_io.serialize.placeholder import serialize_placeholder @@ -79,17 +78,22 @@ def serialize_pipeline( pipeline_storage_directory = target_storage_directory / "pipeline" pipeline_information = PipelineInformation( id=pipeline.id, - predictions=serialize_predictions( - pipeline=pipeline, - container=container, - target_storage_directory=pipeline_storage_directory, + predictions=derive_instances_with_relative_path( + serialize_predictions( + pipeline=pipeline, + container=container, + target_storage_directory=pipeline_storage_directory, + ), + target_storage_directory, ), - feature_sets=serialize_feature_sets( - pipeline=pipeline, - container=container, - target_storage_directory=pipeline_storage_directory, + feature_sets=derive_instances_with_relative_path( + serialize_feature_sets( + pipeline=pipeline, + container=container, + target_storage_directory=pipeline_storage_directory, + ), + target_storage_directory, ), - 
path=target_storage_directory, feature_learners=[ serialize_feature_learner(feature_learner) for feature_learner in pipeline.feature_learners @@ -133,7 +137,7 @@ def serialize_predictions( pipeline: Pipeline, container: Container, target_storage_directory: Path, -) -> PredictionResults: +) -> DataFrameInformationByName: """Serialize the predictions created from all subsets of a Container. Args: @@ -143,12 +147,12 @@ def serialize_predictions( will be saved. Returns: - PredictionResults: A dictionary-like object containing the serialized + DataFrameInformationByName: A dictionary-like object containing the serialized predictions for each subset of the Container. """ predict_storage_directory = target_storage_directory / "predictions" - prediction_results: PredictionResults = {} + prediction_results: DataFrameInformationByName = {} for subset_name in assume_is_dict_str_to_dataframe_or_view(container.subsets): prediction = cast( "NDArray[np.float64]", @@ -175,7 +179,7 @@ def serialize_feature_sets( pipeline: Pipeline, container: Container, target_storage_directory: Path, -) -> FeatureSets: +) -> DataFrameInformationByName: """Serialize the feature sets created from all subsets of a Container. Args: @@ -185,12 +189,12 @@ def serialize_feature_sets( will be saved. Returns: - FeatureSets: A dictionary-like object containing the serialized feature sets - for each subset of the Container. + DataFrameInformationByName: A dictionary-like object containing the serialized + feature sets for each subset of the Container. 
""" transform_storage_directory = target_storage_directory / "feature_sets" - feature_sets: FeatureSets = {} + feature_sets: DataFrameInformationByName = {} for subset_name in assume_is_dict_str_to_dataframe_or_view(container.subsets): features = pipeline.transform( # pyright: ignore [reportUnknownMemberType, reportUnknownVariableType] container[subset_name], @@ -224,15 +228,15 @@ def serialize_feature_learner( feature_learner_as_dict = dataclasses.asdict(feature_learner) match feature_learner: case getml_feature_learner.Fastboost(): - return TypeAdapter(Fastboost).validate_python(feature_learner_as_dict) + return Fastboost.model_validate(feature_learner_as_dict) case getml_feature_learner.FastProp(): - return TypeAdapter(FastProp).validate_python(feature_learner_as_dict) + return FastProp.model_validate(feature_learner_as_dict) case getml_feature_learner.Multirel(): - return TypeAdapter(Multirel).validate_python(feature_learner_as_dict) + return Multirel.model_validate(feature_learner_as_dict) case getml_feature_learner.Relboost(): - return TypeAdapter(Relboost).validate_python(feature_learner_as_dict) + return Relboost.model_validate(feature_learner_as_dict) case getml_feature_learner.RelMT(): - return TypeAdapter(RelMT).validate_python(feature_learner_as_dict) + return RelMT.model_validate(feature_learner_as_dict) def serialize_predictor( @@ -255,17 +259,17 @@ def serialize_predictor( predictor_as_dict = dataclasses.asdict(predictor) match predictor: case getml_predictor.LinearRegression(): - return TypeAdapter(LinearRegression).validate_python(predictor_as_dict) + return LinearRegression.model_validate(predictor_as_dict) case getml_predictor.LogisticRegression(): - return TypeAdapter(LogisticRegression).validate_python(predictor_as_dict) + return LogisticRegression.model_validate(predictor_as_dict) case getml_predictor.ScaleGBMClassifier(): - return TypeAdapter(ScaleGBMClassifier).validate_python(predictor_as_dict) + return 
ScaleGBMClassifier.model_validate(predictor_as_dict) case getml_predictor.ScaleGBMRegressor(): - return TypeAdapter(ScaleGBMRegressor).validate_python(predictor_as_dict) + return ScaleGBMRegressor.model_validate(predictor_as_dict) case getml_predictor.XGBoostClassifier(): - return TypeAdapter(XGBoostClassifier).validate_python(predictor_as_dict) + return XGBoostClassifier.model_validate(predictor_as_dict) case getml_predictor.XGBoostRegressor(): - return TypeAdapter(XGBoostRegressor).validate_python(predictor_as_dict) + return XGBoostRegressor.model_validate(predictor_as_dict) def serialize_preprocessor( # noqa: PLR0911 @@ -289,16 +293,16 @@ def serialize_preprocessor( # noqa: PLR0911 preprocessor_as_dict = dataclasses.asdict(preprocessor) match preprocessor: case getml_preprocessor.CategoryTrimmer(): - return TypeAdapter(CategoryTrimmer).validate_python(preprocessor_as_dict) + return CategoryTrimmer.model_validate(preprocessor_as_dict) case getml_preprocessor.EmailDomain(): - return TypeAdapter(EmailDomain).validate_python(preprocessor_as_dict) + return EmailDomain.model_validate(preprocessor_as_dict) case getml_preprocessor.Imputation(): - return TypeAdapter(Imputation).validate_python(preprocessor_as_dict) + return Imputation.model_validate(preprocessor_as_dict) case getml_preprocessor.Mapping(): - return TypeAdapter(Mapping).validate_python(preprocessor_as_dict) + return Mapping.model_validate(preprocessor_as_dict) case getml_preprocessor.Seasonal(): - return TypeAdapter(Seasonal).validate_python(preprocessor_as_dict) + return Seasonal.model_validate(preprocessor_as_dict) case getml_preprocessor.Substring(): - return TypeAdapter(Substring).validate_python(preprocessor_as_dict) + return Substring.model_validate(preprocessor_as_dict) case getml_preprocessor.TextFieldSplitter(): - return TypeAdapter(TextFieldSplitter).validate_python(preprocessor_as_dict) + return TextFieldSplitter.model_validate(preprocessor_as_dict) diff --git 
a/src/getml_io/serialize/pipeline_information.py b/src/getml_io/serialize/pipeline_information.py index c6a57c2..5716447 100644 --- a/src/getml_io/serialize/pipeline_information.py +++ b/src/getml_io/serialize/pipeline_information.py @@ -1,7 +1,5 @@ from pathlib import Path -from pydantic import TypeAdapter - from getml_io.metadata.pipeline_information import PipelineInformation from getml_io.serialize.exception import ( PipelineInformationSerializationError, @@ -31,7 +29,7 @@ def serialize_pipeline_information( """ try: - pipeline_information_json = TypeAdapter(PipelineInformation).dump_json( + pipeline_information_json = PipelineInformation.model_dump_json( pipeline_information, indent=2, ) @@ -41,9 +39,7 @@ def serialize_pipeline_information( ) from exception pipeline_json_path = target_storage_directory / "pipeline.json" try: - _ = pipeline_json_path.write_bytes( - pipeline_information_json, - ) + _ = pipeline_json_path.write_text(pipeline_information_json) except Exception as exception: raise PipelineInformationStorageError( pipeline_information.id, diff --git a/src/getml_io/serialize/project.py b/src/getml_io/serialize/project.py index df373f7..8aeb79f 100644 --- a/src/getml_io/serialize/project.py +++ b/src/getml_io/serialize/project.py @@ -45,7 +45,7 @@ def serialize_project( target_storage_directory, ) logger.info( - "Serialized Container to %s: %s", + "Serialized Container to %s: %r", container_information_json_path, container_information, ) @@ -56,7 +56,7 @@ def serialize_project( target_storage_directory, ) logger.info( - "Serialized Pipeline to %s: %s", + "Serialized Pipeline to %s: %r", pipeline_information_json_path, pipeline_information, ) diff --git a/tests/integration/assertions.py b/tests/integration/assertions.py index 1aad1a6..e87b62a 100644 --- a/tests/integration/assertions.py +++ b/tests/integration/assertions.py @@ -58,7 +58,6 @@ def assert_container_information( ) -> None: assert container_information.id assert 
container_information.deep_copy == expected_container_information.deep_copy - assert container_information.path == expected_container_information.path assert_dataframe_information( container_information.population, @@ -145,7 +144,6 @@ def assert_pipeline_information( expected_pipeline_information: PipelineInformation, ) -> None: assert pipeline_information.id - assert pipeline_information.path == expected_pipeline_information.path assert ( pipeline_information.feature_sets.keys() diff --git a/tests/integration/data/cora/cora.py b/tests/integration/data/cora/cora.py index 5f3d0e0..f0ab517 100644 --- a/tests/integration/data/cora/cora.py +++ b/tests/integration/data/cora/cora.py @@ -10,7 +10,6 @@ from getml.predictors import ( XGBoostClassifier, ) -from pydantic.dataclasses import dataclass from typing_extensions import override from tests.integration.data.datasets import DataSetName @@ -22,7 +21,6 @@ ) -@dataclass class CoraProject(GetMLProject): pass diff --git a/tests/integration/data/datasets.py b/tests/integration/data/datasets.py index df3a3bf..63d43ee 100644 --- a/tests/integration/data/datasets.py +++ b/tests/integration/data/datasets.py @@ -3,8 +3,9 @@ from collections.abc import Mapping, Sequence from enum import Enum from pathlib import Path +from typing import ClassVar -from pydantic.dataclasses import dataclass +from pydantic import BaseModel, ConfigDict DATA_PATH: Path = Path(__file__).parent @@ -16,8 +17,9 @@ class DataSetName(str, Enum): NUMERICAL = "numerical" -@dataclass -class DataSet: +class DataSet(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict(frozen=True) + name: DataSetName population: Sequence[Path] peripheral: Sequence[Path] diff --git a/tests/integration/data/getmlproject.py b/tests/integration/data/getmlproject.py index 312cabf..5a7519c 100644 --- a/tests/integration/data/getmlproject.py +++ b/tests/integration/data/getmlproject.py @@ -6,14 +6,13 @@ from contextlib import contextmanager from itertools import chain from 
pathlib import Path -from typing import TypeVar +from typing import ClassVar, TypeVar import getml from filelock import FileLock from getml.data import Container, DataFrame, View from getml.pipeline import Pipeline -from pydantic import ConfigDict -from pydantic.dataclasses import dataclass +from pydantic import BaseModel, ConfigDict from tests.integration.data.datasets import DATASETS, DataSetName @@ -110,8 +109,12 @@ def _save_project_bundle(self) -> None: getml.project.save(filename=self._path) # pyright: ignore [reportUnknownMemberType] -@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) -class GetMLProject: +class GetMLProject(BaseModel): + model_config: ClassVar[ConfigDict] = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + name: str pipeline: Pipeline container: Container diff --git a/tests/integration/data/loans/loans.py b/tests/integration/data/loans/loans.py index b9055e9..f1f5d0e 100644 --- a/tests/integration/data/loans/loans.py +++ b/tests/integration/data/loans/loans.py @@ -10,7 +10,6 @@ from getml.predictors import ( XGBoostClassifier, ) -from pydantic.dataclasses import dataclass from typing_extensions import override from tests.integration.data.datasets import DataSetName @@ -22,7 +21,6 @@ ) -@dataclass class LoansProject(GetMLProject): pass diff --git a/tests/integration/data/numerical/numerical.py b/tests/integration/data/numerical/numerical.py index de60674..cdd5344 100644 --- a/tests/integration/data/numerical/numerical.py +++ b/tests/integration/data/numerical/numerical.py @@ -9,7 +9,6 @@ from getml.predictors import ( XGBoostRegressor, ) -from pydantic.dataclasses import dataclass from typing_extensions import override from getml_io.utils.convert import ( @@ -24,7 +23,6 @@ ) -@dataclass class NumericalProject(GetMLProject): pass diff --git a/tests/integration/data/robot/robot.py b/tests/integration/data/robot/robot.py index 15bd378..ab883c8 100644 --- a/tests/integration/data/robot/robot.py +++ 
b/tests/integration/data/robot/robot.py @@ -13,7 +13,6 @@ from getml.predictors import ( XGBoostRegressor, ) -from pydantic.dataclasses import dataclass from typing_extensions import override from tests.integration.data.datasets import DataSetName @@ -27,7 +26,6 @@ logger: logging.Logger = logging.getLogger(__name__) -@dataclass class RobotProject(GetMLProject): pass @@ -60,7 +58,11 @@ def create( container, ) container.save() - return RobotProject(name, pipeline, container) + return RobotProject( + name=name, + pipeline=pipeline, + container=container, + ) def _get_dataframe(self, dataset_name: DataSetName) -> DataFrame: container = self._load_getml_container(dataset_name) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index b8208f0..10af06d 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -1,14 +1,8 @@ -import json from pathlib import Path -from pydantic import TypeAdapter - from getml_io.metadata.container_information import ContainerInformation from getml_io.metadata.pipeline_information import PipelineInformation -ContainerInformationAdapter = TypeAdapter(ContainerInformation) -PipelineInformationAdapter = TypeAdapter(PipelineInformation) - def load_container_information( container_information_json_path: Path, @@ -16,9 +10,9 @@ def load_container_information( assert container_information_json_path.exists() assert container_information_json_path.is_file() - container_information_json = json.loads(container_information_json_path.read_text()) # pyright: ignore [reportAny] - container_information_json["path"] = Path("dummy.json") - return ContainerInformationAdapter.validate_python(container_information_json) + return ContainerInformation.model_validate_json( + container_information_json_path.read_text(), + ) def load_pipeline_information( @@ -27,6 +21,6 @@ def load_pipeline_information( assert pipeline_information_json_path.exists() assert pipeline_information_json_path.is_file() - pipeline_information_json 
= json.loads(pipeline_information_json_path.read_text()) # pyright: ignore [reportAny] - pipeline_information_json["path"] = Path("dummy.json") - return PipelineInformationAdapter.validate_python(pipeline_information_json) + return PipelineInformation.model_validate_json( + pipeline_information_json_path.read_text(), + ) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 6b95f92..db81e0e 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -1,5 +1,4 @@ import copy -import dataclasses import re from collections.abc import Mapping, Sequence from pathlib import Path @@ -128,14 +127,13 @@ def project_information() -> ProjectInformation: @pytest.fixture -def container_information_empty(tmp_path: Path) -> ContainerInformation: +def container_information_empty() -> ContainerInformation: return ContainerInformation( id="container_empty_id", population=None, peripheral={}, subsets={}, deep_copy=False, - path=tmp_path, ) @@ -160,26 +158,61 @@ def column_profile_default() -> ColumnProfile: ) +@pytest.fixture +def container_path() -> Path: + return Path("container") + + +@pytest.fixture +def population_path(container_path: Path) -> Path: + return container_path / "population" + + +@pytest.fixture +def peripheral_path(container_path: Path) -> Path: + return container_path / "peripheral" + + +@pytest.fixture +def subsets_path(container_path: Path) -> Path: + return container_path / "subsets" + + @pytest.fixture def dataframe_information_population( - tmp_path: Path, column_profile_default: ColumnProfile, + population_path: Path, ) -> DataFrameInformation: return DataFrameInformation( name="population", - path=tmp_path / "population.parquet", + path=population_path / "population.parquet", column_profile={"default": column_profile_default}, ) +@pytest.fixture +def pipeline_path() -> Path: + return Path("pipeline") + + +@pytest.fixture +def predictions_path(pipeline_path: Path) -> Path: + return pipeline_path / "predictions" + + +@pytest.fixture +def 
feature_sets_path(pipeline_path: Path) -> Path: + return pipeline_path / "feature_sets" + + @pytest.fixture def dataframe_information_peripheral( - tmp_path: Path, column_profile_default: ColumnProfile, + peripheral_path: Path, ) -> DataFrameInformation: return DataFrameInformation( name="peripheral", - path=tmp_path / "peripheral.parquet", + path=peripheral_path / "peripheral.parquet", column_profile={"default": column_profile_default}, ) @@ -197,8 +230,7 @@ def dataframe_information_subset( @pytest.fixture -def container_information( # noqa: PLR0913 - tmp_path: Path, +def container_information( dataframe_information_population: DataFrameInformation, dataframe_information_peripheral: DataFrameInformation, dataframe_information_train: DataFrameInformation, @@ -217,7 +249,6 @@ def container_information( # noqa: PLR0913 "validation": dataframe_information_validation, }, deep_copy=True, - path=tmp_path, ) @@ -255,36 +286,36 @@ def mock_dataframe_train(mock_dataframe: DataFrame) -> DataFrame: @pytest.fixture def dataframe_information_train( - tmp_path: Path, column_profile_default: ColumnProfile, + subsets_path: Path, ) -> DataFrameInformation: return DataFrameInformation( name="dataframe_train", - path=tmp_path / "dataframe_train.parquet", + path=subsets_path / "dataframe_train.parquet", column_profile={"default": column_profile_default}, ) @pytest.fixture def dataframe_information_test( - tmp_path: Path, column_profile_default: ColumnProfile, + subsets_path: Path, ) -> DataFrameInformation: return DataFrameInformation( name="dataframe_test", - path=tmp_path / "dataframe_test.parquet", + path=subsets_path / "dataframe_test.parquet", column_profile={"default": column_profile_default}, ) @pytest.fixture def dataframe_information_validation( - tmp_path: Path, column_profile_default: ColumnProfile, + subsets_path: Path, ) -> DataFrameInformation: return DataFrameInformation( name="dataframe_validation", - path=tmp_path / "dataframe_validation.parquet", + path=subsets_path 
/ "dataframe_validation.parquet", column_profile={"default": column_profile_default}, ) @@ -351,14 +382,12 @@ def data_model_information_empty( @pytest.fixture def pipeline_information_empty( - tmp_path: Path, data_model_information_empty: DataModelInformation, ) -> PipelineInformation: return PipelineInformation( id="pipeline_empty_id", predictions={}, feature_sets={}, - path=tmp_path, feature_learners=[], feature_selectors=[], include_categorical=False, @@ -439,24 +468,24 @@ def mock_project_empty( @pytest.fixture def dataframe_information_features_test( - tmp_path: Path, column_profile_default: ColumnProfile, + feature_sets_path: Path, ) -> DataFrameInformation: return DataFrameInformation( name="features.test", - path=tmp_path / "features.test.parquet", + path=feature_sets_path / "features.test.parquet", column_profile={"default": column_profile_default}, ) @pytest.fixture def dataframe_information_features_validation( - tmp_path: Path, column_profile_default: ColumnProfile, + feature_sets_path: Path, ) -> DataFrameInformation: return DataFrameInformation( name="features.validation", - path=tmp_path / "features.validation.parquet", + path=feature_sets_path / "features.validation.parquet", column_profile={"default": column_profile_default}, ) @@ -538,7 +567,6 @@ def category_trimmer() -> CategoryTrimmer: @pytest.fixture def pipeline_information( # noqa: PLR0913 - tmp_path: Path, dataframe_information_test: DataFrameInformation, dataframe_information_validation: DataFrameInformation, dataframe_information_features_test: DataFrameInformation, @@ -547,42 +575,38 @@ def pipeline_information( # noqa: PLR0913 fast_prop: FastProp, linear_regression: LinearRegression, category_trimmer: CategoryTrimmer, + predictions_path: Path, + feature_sets_path: Path, ) -> PipelineInformation: return PipelineInformation( id="pipeline_id", predictions={ - "test": dataclasses.replace( - dataframe_information_test, - path=tmp_path - / "pipeline" - / "predictions" - / 
dataframe_information_test.path.name, + "test": dataframe_information_test.model_copy( + update={ + "path": predictions_path / dataframe_information_test.path.name, + }, ), - "validation": dataclasses.replace( - dataframe_information_validation, - path=tmp_path - / "pipeline" - / "predictions" - / dataframe_information_validation.path.name, + "validation": dataframe_information_validation.model_copy( + update={ + "path": predictions_path + / dataframe_information_validation.path.name, + }, ), }, feature_sets={ - "test": dataclasses.replace( - dataframe_information_features_test, - path=tmp_path - / "pipeline" - / "feature_sets" - / dataframe_information_features_test.path.name, + "test": dataframe_information_features_test.model_copy( + update={ + "path": feature_sets_path + / dataframe_information_features_test.path.name, + }, ), - "validation": dataclasses.replace( - dataframe_information_features_validation, - path=tmp_path - / "pipeline" - / "feature_sets" - / dataframe_information_features_validation.path.name, + "validation": dataframe_information_features_validation.model_copy( + update={ + "path": feature_sets_path + / dataframe_information_features_validation.path.name, + }, ), }, - path=tmp_path, feature_learners=[fast_prop], feature_selectors=[linear_regression], include_categorical=False, diff --git a/tests/unit/getml/test_project.py b/tests/unit/getml/test_project.py index a2fb111..33f43a3 100644 --- a/tests/unit/getml/test_project.py +++ b/tests/unit/getml/test_project.py @@ -1,5 +1,7 @@ import pytest import pytest_mock +from getml.data import Container +from getml.pipeline import Pipeline from getml_io.getml.exception import ( PipelineNotFoundError, @@ -35,11 +37,11 @@ def test_load_project( ) mock_load_pipeline = mocker.patch( "getml.pipeline.load", - return_value=mocker.Mock(id="pipeline_id"), + return_value=mocker.Mock(spec=Pipeline, id="pipeline_id"), ) mock_load_container = mocker.patch( "getml.data.load_container", - 
return_value=mocker.Mock(id="container_id"), + return_value=mocker.Mock(spec=Container, id="container_id"), ) mock_suspend_project = mocker.patch( "getml.engine.suspend_project", diff --git a/tests/unit/metadata/test_container_information.py b/tests/unit/metadata/test_container_information.py index 0bbd005..8e69855 100644 --- a/tests/unit/metadata/test_container_information.py +++ b/tests/unit/metadata/test_container_information.py @@ -1,7 +1,6 @@ from pathlib import Path import pytest -from pydantic import TypeAdapter from getml_io.getml.roles import Role from getml_io.metadata.container_information import ContainerInformation @@ -13,9 +12,7 @@ def test_serialize_model_without_dataframe_information( container_information_empty: ContainerInformation, ) -> None: # When - serialized_model = TypeAdapter(ContainerInformation).dump_python( # pyright: ignore [reportAny] - container_information_empty, - ) + serialized_model = ContainerInformation.model_dump(container_information_empty) # Then expected_serialized_container_information = ( _get_expected_empty_container_information() @@ -36,20 +33,29 @@ def _get_expected_empty_container_information() -> ContainerInformationType: @pytest.mark.unit def test_serialize_model( container_information: ContainerInformation, + population_path: Path, + peripheral_path: Path, + subsets_path: Path, ) -> None: # When - serialized_model = TypeAdapter(ContainerInformation).dump_python( # pyright: ignore [reportAny] - container_information, - ) + serialized_model = ContainerInformation.model_dump(container_information) # Then expected_serialized_container_information = ( - _get_expected_serialized_container_information() + _get_expected_serialized_container_information( + population_path, + peripheral_path, + subsets_path, + ) ) assert serialized_model == expected_serialized_container_information -def _get_expected_serialized_container_information() -> ContainerInformationType: +def _get_expected_serialized_container_information( + 
population_path: Path, + peripheral_path: Path, + subsets_path: Path, +) -> ContainerInformationType: expected_column_profile: ColumnProfileType = { "default": { "name": "default", @@ -74,30 +80,30 @@ def _get_expected_serialized_container_information() -> ContainerInformationType "id": "container_id", "population": { "name": "population", - "path": Path("population.parquet"), + "path": population_path / "population.parquet", "column_profile": expected_column_profile, }, "peripheral": { "peripheral": { "name": "peripheral", - "path": Path("peripheral.parquet"), + "path": peripheral_path / "peripheral.parquet", "column_profile": expected_column_profile, }, }, "subsets": { "train": { "name": "dataframe_train", - "path": Path("dataframe_train.parquet"), + "path": subsets_path / "dataframe_train.parquet", "column_profile": expected_column_profile, }, "test": { "name": "dataframe_test", - "path": Path("dataframe_test.parquet"), + "path": subsets_path / "dataframe_test.parquet", "column_profile": expected_column_profile, }, "validation": { "name": "dataframe_validation", - "path": Path("dataframe_validation.parquet"), + "path": subsets_path / "dataframe_validation.parquet", "column_profile": expected_column_profile, }, }, diff --git a/tests/unit/metadata/test_pipeline_information.py b/tests/unit/metadata/test_pipeline_information.py index 7ec8e15..a3c713a 100644 --- a/tests/unit/metadata/test_pipeline_information.py +++ b/tests/unit/metadata/test_pipeline_information.py @@ -1,7 +1,6 @@ from pathlib import Path import pytest -from pydantic import TypeAdapter from getml_io.getml.relationships import Relationship from getml_io.getml.roles import Role @@ -14,9 +13,7 @@ def test_serialize_model_without_transforms( pipeline_information_empty: PipelineInformation, ) -> None: # When - serialized_model = TypeAdapter(PipelineInformation).dump_python( # pyright: ignore [reportAny] - pipeline_information_empty, - ) + serialized_model = 
PipelineInformation.model_dump(pipeline_information_empty) # Then expected_serialized_pipeline_information = ( @@ -68,9 +65,7 @@ def test_serialize_model( pipeline_information: PipelineInformation, ) -> None: # When - serialized_model = TypeAdapter(PipelineInformation).dump_python( # pyright: ignore [reportAny] - pipeline_information, - ) + serialized_model = PipelineInformation.model_dump(pipeline_information) # Then serialized_model["feature_learners"][0]["aggregation"] = list( diff --git a/tests/unit/serialize/test_container.py b/tests/unit/serialize/test_container.py index 24adb4c..6129f8a 100644 --- a/tests/unit/serialize/test_container.py +++ b/tests/unit/serialize/test_container.py @@ -14,30 +14,32 @@ @pytest.mark.unit -def test_serialize_container( +def test_serialize_container( # noqa: PLR0913 mock_container: Container, tmp_path: Path, mock_duckdb_execute_factory: MockDuckDBExecuteFactory, + population_path: Path, + peripheral_path: Path, + subsets_path: Path, ) -> None: # Given + target_storage_directory = tmp_path population = assume_is_optional_dataframe_or_view(mock_container.population) mock_duckdb_execute_factory( { **( - {Path(f"container/population/{population.name}.parquet"): population} + {population_path / f"{population.name}.parquet": population} if population is not None else {} ), **{ - Path( - f"container/peripheral/{peripheral.name}.parquet", - ): peripheral + peripheral_path / f"{peripheral.name}.parquet": peripheral for peripheral in assume_is_dict_str_to_dataframe_or_view( mock_container.peripheral, ).values() }, **{ - Path(f"container/subsets/{subset_name}.{subset.name}.parquet"): subset + subsets_path / f"{subset_name}.{subset.name}.parquet": subset for subset_name, subset in assume_is_dict_str_to_dataframe_or_view( mock_container.subsets, ).items() @@ -45,16 +47,16 @@ def test_serialize_container( }, ) - mock_container.path = tmp_path - # When container_information_json_path, container_information = serialize_container( 
mock_container, - tmp_path, + target_storage_directory, ) # Then - expected_container_information_json_path = tmp_path / "container.json" + expected_container_information_json_path = ( + target_storage_directory / "container.json" + ) assert expected_container_information_json_path.exists() assert container_information_json_path == expected_container_information_json_path @@ -63,14 +65,14 @@ def test_serialize_container( assert container_information.population is not None assert container_information.population.name == "mock_population_name" assert container_information.population.path == ( - tmp_path / "container/population/mock_population_name.parquet" + population_path / "mock_population_name.parquet" ) assert len(container_information.peripheral) == 1 peripheral_information = container_information.peripheral["mock_peripheral_name"] assert peripheral_information.name == "mock_peripheral_name" assert peripheral_information.path == ( - tmp_path / "container/peripheral/mock_peripheral_name.parquet" + peripheral_path / "mock_peripheral_name.parquet" ) expected_number_of_subsets = 3 @@ -79,29 +81,35 @@ def test_serialize_container( train_information = container_information.subsets["train"] assert train_information.name == "mock_dataframe_train" assert train_information.path == ( - tmp_path / "container/subsets/train.mock_dataframe_train.parquet" + subsets_path / "train.mock_dataframe_train.parquet" ) test_information = container_information.subsets["test"] assert test_information.name == "mock_dataframe_test" - assert test_information.path == ( - tmp_path / "container/subsets/test.mock_dataframe_test.parquet" - ) + assert test_information.path == subsets_path / "test.mock_dataframe_test.parquet" validation_information = container_information.subsets["validation"] assert validation_information.name == "mock_dataframe_validation" assert validation_information.path == ( - tmp_path / "container/subsets/validation.mock_dataframe_validation.parquet" + subsets_path / 
"validation.mock_dataframe_validation.parquet" ) assert not container_information.deep_copy - expected_container_json_content = _get_expected_container_information() + expected_container_json_content = _get_expected_container_information( + population_path, + peripheral_path, + subsets_path, + ) container_json = expected_container_information_json_path.read_text() assert json.loads(container_json) == expected_container_json_content -def _get_expected_container_information() -> ContainerInformationType: +def _get_expected_container_information( + population_path: Path, + peripheral_path: Path, + subsets_path: Path, +) -> ContainerInformationType: expected_statistics_categorical = { "approx_unique": 0, "column_type": "VARCHAR", @@ -171,31 +179,31 @@ def _get_expected_container_information() -> ContainerInformationType: "id": "mock_container_id", "population": { "name": "mock_population_name", - "path": "container/population/mock_population_name.parquet", + "path": str(population_path / "mock_population_name.parquet"), "column_profile": expected_column_profile, }, "peripheral": { "mock_peripheral_name": { "name": "mock_peripheral_name", - "path": ("container/peripheral/mock_peripheral_name.parquet"), + "path": str(peripheral_path / "mock_peripheral_name.parquet"), "column_profile": expected_column_profile, }, }, "subsets": { "test": { "name": "mock_dataframe_test", - "path": "container/subsets/test.mock_dataframe_test.parquet", + "path": str(subsets_path / "test.mock_dataframe_test.parquet"), "column_profile": expected_column_profile, }, "train": { "name": "mock_dataframe_train", - "path": "container/subsets/train.mock_dataframe_train.parquet", + "path": str(subsets_path / "train.mock_dataframe_train.parquet"), "column_profile": expected_column_profile, }, "validation": { "name": "mock_dataframe_validation", - "path": ( - "container/subsets/validation.mock_dataframe_validation.parquet" + "path": str( + subsets_path / "validation.mock_dataframe_validation.parquet", ), 
"column_profile": expected_column_profile, }, @@ -210,16 +218,18 @@ def test_serialize_container_empty( tmp_path: Path, ) -> None: # Given - mock_container_empty.path = tmp_path + target_storage_directory = tmp_path # When container_information_json_path, container_information = serialize_container( mock_container_empty, - tmp_path, + target_storage_directory, ) # Then - expected_container_information_json_path = tmp_path / "container.json" + expected_container_information_json_path = ( + target_storage_directory / "container.json" + ) assert expected_container_information_json_path.exists() assert container_information_json_path == expected_container_information_json_path diff --git a/tests/unit/serialize/test_container_information.py b/tests/unit/serialize/test_container_information.py index 1b388d3..5d89c30 100644 --- a/tests/unit/serialize/test_container_information.py +++ b/tests/unit/serialize/test_container_information.py @@ -14,10 +14,11 @@ @pytest.mark.unit def test_serialize_container_information( + tmp_path: Path, container_information_empty: ContainerInformation, ) -> None: # Given - target_storage_directory = container_information_empty.path + target_storage_directory = tmp_path # When container_information_json_path = serialize_container_information( @@ -60,7 +61,7 @@ def test_serialize_container_information_serialization_error( ) -> None: # Given mock_dump_json = mocker.patch( - "getml_io.serialize.container_information.TypeAdapter.dump_json", + "getml_io.serialize.container_information.ContainerInformation.model_dump_json", side_effect=Exception("Serialization error"), ) @@ -88,9 +89,8 @@ def test_serialize_container_information_storage_error( ) -> None: # Given invalid_target_storage_directory = Path("/invalid/path") - container_information_empty.path = invalid_target_storage_directory - mock_write_bytes = mocker.patch( - "pathlib.Path.write_bytes", + mock_write_text = mocker.patch( + "pathlib.Path.write_text", side_effect=Exception("Storage 
error"), ) @@ -108,4 +108,4 @@ def test_serialize_container_information_storage_error( ) # Then - mock_write_bytes.assert_called_once() + mock_write_text.assert_called_once() diff --git a/tests/unit/metadata/test_utils.py b/tests/unit/serialize/test_dataframe_information.py similarity index 62% rename from tests/unit/metadata/test_utils.py rename to tests/unit/serialize/test_dataframe_information.py index 224070c..7143f83 100644 --- a/tests/unit/metadata/test_utils.py +++ b/tests/unit/serialize/test_dataframe_information.py @@ -6,7 +6,10 @@ from getml_io.metadata.exception import ( DataFrameInformationPathNotRelativeError, ) -from getml_io.metadata.utils import derive_instance_with_relative_path +from getml_io.serialize.dataframe_information import ( + derive_instance_with_relative_path, + derive_instances_with_relative_path, +) @pytest.mark.unit @@ -47,3 +50,22 @@ def test_derive_instance_with_relative_path_not_relative( dataframe_information, non_relative_path, ) + + +@pytest.mark.unit +def test_derive_instances_with_relative_path( + tmp_path: Path, + dataframe_information: DataFrameInformation, +) -> None: + # When + derived_instances_by_name = derive_instances_with_relative_path( + {dataframe_information.name: dataframe_information}, + tmp_path, + ) + + # Then + assert list(derived_instances_by_name.keys()) == [dataframe_information.name] + + derived_instance = derived_instances_by_name[dataframe_information.name] + assert isinstance(derived_instance, DataFrameInformation) + assert derived_instance.path == Path(dataframe_information.path.name) diff --git a/tests/unit/serialize/test_dataframe_or_view.py b/tests/unit/serialize/test_dataframe_or_view.py index 01e5f03..a5b8120 100644 --- a/tests/unit/serialize/test_dataframe_or_view.py +++ b/tests/unit/serialize/test_dataframe_or_view.py @@ -4,7 +4,6 @@ import pytest import pytest_mock from getml.data import DataFrame -from pydantic import TypeAdapter from getml_io.getml.roles import Role from 
getml_io.metadata.dataframe_information import ( @@ -20,8 +19,6 @@ from getml_io.utils.exception import StorageDirectoryCreationError from tests.unit.conftest import MockDuckDBExecuteFactory -ColumnProfileAdapter = TypeAdapter(ColumnProfile) - @pytest.mark.unit def test_serialize_dataframe_or_view( @@ -129,9 +126,9 @@ def _get_expected_column_profile() -> dict[str, ColumnProfile]: def column_profiles_to_json( column_profiles_by_name: Mapping[str, ColumnProfile], -) -> dict[str, bytes]: +) -> dict[str, str]: return { - column_name: ColumnProfileAdapter.dump_json(column_profile) + column_name: ColumnProfile.model_dump_json(column_profile) for column_name, column_profile in column_profiles_by_name.items() } diff --git a/tests/unit/serialize/test_pipeline.py b/tests/unit/serialize/test_pipeline.py index ad523c7..d30b45d 100644 --- a/tests/unit/serialize/test_pipeline.py +++ b/tests/unit/serialize/test_pipeline.py @@ -34,8 +34,7 @@ Substring, TextFieldSplitter, ) -from getml_io.metadata.feature_sets import FeatureSets -from getml_io.metadata.prediction_results import PredictionResults +from getml_io.metadata.dataframe_information import DataFrameInformationByName from getml_io.serialize.pipeline import ( serialize_feature_learner, serialize_feature_sets, @@ -55,35 +54,37 @@ def test_serialize_pipeline( # noqa: PLR0913 mock_duckdb_execute_factory: MockDuckDBExecuteFactory, mock_dataframe: DataFrame, mock_getml_dataframe_from_array: None, + feature_sets_path: Path, + predictions_path: Path, ) -> None: # Given _ = mock_getml_dataframe_from_array mock_duckdb_execute_factory( { - Path("pipeline/feature_sets/features.train.parquet"): mock_dataframe, - Path("pipeline/feature_sets/features.test.parquet"): mock_dataframe, - Path("pipeline/feature_sets/features.validation.parquet"): mock_dataframe, - Path("pipeline/predictions/prediction.train.parquet"): mock_dataframe, - Path("pipeline/predictions/prediction.test.parquet"): mock_dataframe, - 
Path("pipeline/predictions/prediction.validation.parquet"): mock_dataframe, + feature_sets_path / "features.train.parquet": mock_dataframe, + feature_sets_path / "features.test.parquet": mock_dataframe, + feature_sets_path / "features.validation.parquet": mock_dataframe, + predictions_path / "prediction.train.parquet": mock_dataframe, + predictions_path / "prediction.test.parquet": mock_dataframe, + predictions_path / "prediction.validation.parquet": mock_dataframe, }, ) + target_storage_directory = tmp_path # When pipeline_information_json_path, pipeline_information = serialize_pipeline( mock_pipeline, mock_container, - tmp_path, + target_storage_directory, ) # Then assert pipeline_information.id == mock_pipeline.id - assert pipeline_information.path == tmp_path - expected_pipeline_information_json_path = tmp_path / "pipeline.json" + expected_pipeline_information_json_path = target_storage_directory / "pipeline.json" assert pipeline_information_json_path == expected_pipeline_information_json_path - pipeline_path = tmp_path / "pipeline" + pipeline_path = Path("pipeline") # Then - feature sets for subset in ["train", "test", "validation"]: @@ -108,18 +109,20 @@ def test_serialize_pipeline_with_empty_outputs( mock_pipeline: Pipeline, mock_container_empty: Container, ) -> None: + # Given + target_storage_directory = tmp_path + # When pipeline_information_json_path, pipeline_information = serialize_pipeline( mock_pipeline, mock_container_empty, - tmp_path, + target_storage_directory, ) # Then assert pipeline_information.id == mock_pipeline.id - assert pipeline_information.path == tmp_path - expected_pipeline_information_json_path = tmp_path / "pipeline.json" + expected_pipeline_information_json_path = target_storage_directory / "pipeline.json" assert pipeline_information_json_path == expected_pipeline_information_json_path # Then - feature sets @@ -130,27 +133,29 @@ def test_serialize_pipeline_with_empty_outputs( @pytest.mark.unit -def test_serialize_feature_sets( +def 
test_serialize_feature_sets( # noqa: PLR0913 tmp_path: Path, mock_pipeline: Pipeline, mock_container: Container, mock_duckdb_execute_factory: MockDuckDBExecuteFactory, mock_dataframe: DataFrame, + feature_sets_path: Path, ) -> None: # Given mock_duckdb_execute_factory( { - Path("feature_sets/features.train.parquet"): mock_dataframe, - Path("feature_sets/features.test.parquet"): mock_dataframe, - Path("feature_sets/features.validation.parquet"): mock_dataframe, + feature_sets_path / "features.train.parquet": mock_dataframe, + feature_sets_path / "features.test.parquet": mock_dataframe, + feature_sets_path / "features.validation.parquet": mock_dataframe, }, ) + target_storage_directory = tmp_path / "pipeline" # When feature_sets = serialize_feature_sets( mock_pipeline, mock_container, - tmp_path, + target_storage_directory, ) # Then @@ -161,7 +166,7 @@ def test_serialize_feature_sets( _assert_features_valid( feature_sets, subset, - tmp_path / "feature_sets", + target_storage_directory / "feature_sets", ) @@ -173,22 +178,24 @@ def test_serialize_predictions( # noqa: PLR0913 mock_duckdb_execute_factory: MockDuckDBExecuteFactory, mock_dataframe: DataFrame, mock_getml_dataframe_from_array: None, + predictions_path: Path, ) -> None: # Given _ = mock_getml_dataframe_from_array mock_duckdb_execute_factory( { - Path("predictions/prediction.train.parquet"): mock_dataframe, - Path("predictions/prediction.test.parquet"): mock_dataframe, - Path("predictions/prediction.validation.parquet"): mock_dataframe, + predictions_path / "prediction.train.parquet": mock_dataframe, + predictions_path / "prediction.test.parquet": mock_dataframe, + predictions_path / "prediction.validation.parquet": mock_dataframe, }, ) + target_storage_directory = tmp_path / "pipeline" # When prediction_results = serialize_predictions( mock_pipeline, mock_container, - tmp_path, + target_storage_directory, ) # Then @@ -199,12 +206,12 @@ def test_serialize_predictions( # noqa: PLR0913 _assert_predictions_valid( 
prediction_results, subset, - tmp_path / "predictions", + target_storage_directory / "predictions", ) def _assert_predictions_valid( - predictions: PredictionResults, + predictions: DataFrameInformationByName, subset_name: str, path: Path, ) -> None: @@ -216,7 +223,7 @@ def _assert_predictions_valid( def _assert_features_valid( - features: FeatureSets, + features: DataFrameInformationByName, subset_name: str, path: Path, ) -> None: diff --git a/tests/unit/serialize/test_pipeline_information.py b/tests/unit/serialize/test_pipeline_information.py index ca9d8ed..7496797 100644 --- a/tests/unit/serialize/test_pipeline_information.py +++ b/tests/unit/serialize/test_pipeline_information.py @@ -86,7 +86,7 @@ def test_serialize_pipeline_information_serialization_error( ) -> None: # Given mock_dump_json = mocker.patch( - "getml_io.serialize.pipeline_information.TypeAdapter.dump_json", + "getml_io.serialize.pipeline_information.PipelineInformation.model_dump_json", side_effect=Exception("Serialization error."), ) @@ -114,8 +114,8 @@ def test_serialize_pipeline_information_storage_error( ) -> None: # Given target_storage_directory = Path("/invalid/storage/directory") - mock_write_bytes = mocker.patch( - "pathlib.Path.write_bytes", + mock_write_text = mocker.patch( + "pathlib.Path.write_text", side_effect=Exception("Storage error."), ) @@ -132,4 +132,4 @@ def test_serialize_pipeline_information_storage_error( target_storage_directory, ) - mock_write_bytes.assert_called() + mock_write_text.assert_called() diff --git a/tests/unit/serialize/test_project.py b/tests/unit/serialize/test_project.py index bc7f235..0114f64 100644 --- a/tests/unit/serialize/test_project.py +++ b/tests/unit/serialize/test_project.py @@ -1,3 +1,5 @@ +from pathlib import Path + import pytest import pytest_mock @@ -9,15 +11,16 @@ @pytest.mark.unit -def test_serialize_project( +def test_serialize_project( # noqa: PLR0913 mocker: pytest_mock.MockerFixture, + tmp_path: Path, project_information: 
ProjectInformation, container_information_empty: ContainerInformation, pipeline_information_empty: PipelineInformation, mock_project_empty: Project, ) -> None: # Given - root_storage_directory = container_information_empty.path + root_storage_directory = tmp_path mock_create_target_storage_directory = mocker.patch( "getml_io.serialize.project.create_target_storage_directory", return_value=root_storage_directory, @@ -28,12 +31,12 @@ def test_serialize_project( ) mock_serialize_container = mocker.patch( "getml_io.serialize.project.serialize_container", - return_value=(container_information_empty.path, container_information_empty), + return_value=(tmp_path, container_information_empty), ) mock_serialize_pipeline = mocker.patch( "getml_io.serialize.project.serialize_pipeline", - return_value=(pipeline_information_empty.path, pipeline_information_empty), + return_value=(tmp_path, pipeline_information_empty), ) # When