From 5a9341c161f35157f4ff304fbc110affa37fc4c9 Mon Sep 17 00:00:00 2001 From: Manuel Bellersen Date: Wed, 10 Sep 2025 19:01:51 +0200 Subject: [PATCH] GH-56: refactor(serialize): Decouple serialization logic from disk I/O Refactor the serialization logic to decouple the process of extracting project metadata and data from the process of writing them to disk. This makes the serialization logic more flexible, testable, and enables in-memory inspection of project structures without any file I/O. The main change is that the `serialize_*` functions (e.g., `serialize_project`, `serialize_container`) now accept an optional `target_storage_directory` argument: - If a storage directory is provided, the project and its data artifacts are saved to disk as before. - If `target_storage_directory` is `None`, only the metadata `*Information` objects are generated and returned, with no disk I/O performed. This allows for a two-phase approach where project information can be gathered and analyzed in memory before deciding whether to persist it. Key changes include: - **Optional I/O:** `serialize_*` functions now conditionally perform disk I/O based on the presence of a `target_storage_directory`. - **Decoupled Metadata:** The `path` attribute has been removed from `DataFrameInformation`, removing the tight coupling between metadata and a physical file location. - **Clearer Naming:** `ProjectInformation` was renamed to `ProjectIdentification` to clarify its role in locating a project. Renamed `serialize_*` to `convert_*` for functions that only transform getML objects to pydantic models without I/O. - **Centralized I/O:** Parquet I/O and column statistics calculation have been centralized into dedicated modules (`serialize/column_information.py`, `serialize/parquet.py`). - **Improved Return Values:** `serialize_project` now consistently returns the `ProjectInformation` object, allowing for immediate in-memory use. - **Refactored Tests:** Tests have been updated to align with the new architecture, allowing for separate testing of the metadata generation and file I/O stages. --- pyproject.toml | 2 +- src/getml_io/cli.py | 16 +- ...formation.py => project_identification.py} | 2 +- src/getml_io/metadata/column_statistics.py | 110 ++++++++ .../metadata/dataframe_information.py | 109 +------- src/getml_io/metadata/exception.py | 44 --- src/getml_io/metadata/project_information.py | 9 + src/getml_io/serialize/column_information.py | 187 +++++++++++++ src/getml_io/serialize/container.py | 140 +++++----- .../serialize/container_information.py | 16 +- src/getml_io/serialize/data_model.py | 14 +- .../serialize/dataframe_information.py | 61 ----- src/getml_io/serialize/dataframe_or_view.py | 47 ++-- src/getml_io/serialize/exception.py | 2 +- src/getml_io/serialize/parquet.py | 205 ++++---------- src/getml_io/serialize/pipeline.py | 252 +++++++++--------- .../serialize/pipeline_information.py | 16 +- src/getml_io/serialize/placeholder.py | 14 +- src/getml_io/serialize/project.py | 75 +++--- src/getml_io/serialize/roles.py | 16 +- src/getml_io/serialize/table.py | 88 ++++++ src/getml_io/utils/duckdb.py | 95 +++++++ src/getml_io/utils/storage.py | 18 +- tests/integration/assertions.py | 3 +- .../data/loans/expected.container.json | 5 - .../data/loans/expected.pipeline.json | 6 +- .../data/numerical/expected.container.json | 3 - .../data/numerical/expected.pipeline.json | 6 +- .../data/robot/expected.container.json | 4 - .../data/robot/expected.pipeline.json | 8 +- tests/integration/test_serialize_cora.py | 12 + tests/integration/test_serialize_loans.py | 44 +++ tests/integration/test_serialize_numerical.py | 44 +++ tests/integration/test_serialize_robot.py | 44 +++ tests/unit/conftest.py | 131 +++------ .../metadata/test_container_information.py | 22 +- .../metadata/test_pipeline_information.py | 5 - .../unit/serialize/test_column_information.py | 217 +++++++++++++++ tests/unit/serialize/test_container.py | 87 ++---- .../serialize/test_dataframe_information.py | 71 ----- .../unit/serialize/test_dataframe_or_view.py | 64 +++-- tests/unit/serialize/test_parquet.py | 132 +-------- tests/unit/serialize/test_pipeline.py | 214 +++++++-------- tests/unit/serialize/test_project.py | 17 +- tests/unit/serialize/test_table.py | 88 ++++++ tests/unit/test_cli.py | 28 +- tests/unit/utils/test_storage.py | 34 +-- 47 files changed, 1592 insertions(+), 1235 deletions(-) rename src/getml_io/getml/{project_information.py => project_identification.py} (64%) create mode 100644 src/getml_io/metadata/column_statistics.py delete mode 100644 src/getml_io/metadata/exception.py create mode 100644 src/getml_io/metadata/project_information.py create mode 100644 src/getml_io/serialize/column_information.py delete mode 100644 src/getml_io/serialize/dataframe_information.py create mode 100644 src/getml_io/serialize/table.py create mode 100644 src/getml_io/utils/duckdb.py create mode 100644 tests/unit/serialize/test_column_information.py delete mode 100644 tests/unit/serialize/test_dataframe_information.py create mode 100644 tests/unit/serialize/test_table.py diff --git a/pyproject.toml b/pyproject.toml index 9041f50..4ee0f61 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "uv_build" [project] name = "getml-io" -version = "0.1.0" +version = "0.2.0" description = "Library for serializing data and information from getML projects, pipelines, containers, dataframes, data-models and related components" readme = "README.md" authors = [ diff --git a/src/getml_io/cli.py b/src/getml_io/cli.py index 97fb473..16e42e6 100644 --- a/src/getml_io/cli.py +++ b/src/getml_io/cli.py @@ -6,7 +6,7 @@ import typer from typer import Option -from getml_io.getml.project_information import ProjectInformation +from getml_io.getml.project_identification import ProjectIdentification from getml_io.serialize.project import serialize_project from getml_io.utils.storage import get_default_root_storage_directory @@ -82,7 +82,7 @@ def serialize( root_storage_directory: Annotated[ Path, Option( - help="Path to the directory where the serialized project will be saved", + help="Path to the directory where the serialized project will be saved.", prompt=False, show_default=True, ), @@ -97,14 +97,14 @@ def serialize( ] = False, ) -> None: """Serialize a getML project.""" - project_information: ProjectInformation = ProjectInformation( + project_identification = ProjectIdentification( project_name=project, pipeline_id=pipeline, container_id=container, ) - serialize_project( - project_information, - root_storage_directory, + _ = serialize_project( + project_identification=project_identification, + root_storage_directory=root_storage_directory, clear_storage_directory=clear_storage_directory, ) @@ -148,12 +148,12 @@ def deserialize( ] = DEFAULT_ROOT_STORAGE_DIRECTORY, ) -> None: """Deserialize a getML project.""" - project_information: ProjectInformation = ProjectInformation( + project_identification = ProjectIdentification( project_name=project, pipeline_id=pipeline, container_id=container, ) - message = f"Deserializing {project_information!r} from {root_storage_directory}" + message = f"Deserializing {project_identification!r} from {root_storage_directory}" # TODO @urfoex: #20: Implement deserialization logic raise NotImplementedError(message) diff --git a/src/getml_io/getml/project_information.py b/src/getml_io/getml/project_identification.py similarity index 64% rename from src/getml_io/getml/project_information.py rename to src/getml_io/getml/project_identification.py index 7e3a1ff..c74404f 100644 --- a/src/getml_io/getml/project_information.py +++ b/src/getml_io/getml/project_identification.py @@ -1,7 +1,7 @@ from pydantic import BaseModel -class ProjectInformation(BaseModel, frozen=True): +class ProjectIdentification(BaseModel, frozen=True): project_name: str pipeline_id: str container_id: str diff --git a/src/getml_io/metadata/column_statistics.py b/src/getml_io/metadata/column_statistics.py new file mode 100644 index 0000000..38b7a49 --- /dev/null +++ b/src/getml_io/metadata/column_statistics.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +from datetime import datetime +from enum import Enum +from typing import Annotated, Literal + +from pydantic import BaseModel, Field + +from getml_io.getml.roles import Role + + +class ColumnStatisticsBase(BaseModel, frozen=True): + count: int + approx_unique: int + null_percentage: float | None + + +class ColumnStatisticsDouble(ColumnStatisticsBase, frozen=True): + avg: float | None + min: float | None + max: float | None + q25: float | None + q50: float | None + q75: float | None + std: float | None + column_type: Literal["DOUBLE"] + + +class ColumnStatisticsVarchar(ColumnStatisticsBase, frozen=True): + min: str | None + max: str | None + column_type: Literal["VARCHAR"] + + +class ColumnStatisticsNumerical(ColumnStatisticsDouble, frozen=True): + type: Literal["numerical"] = "numerical" + + +class ColumnStatisticsTarget(ColumnStatisticsDouble, frozen=True): + type: Literal["target"] = "target" + + +class ColumnStatisticsTimeStamp(ColumnStatisticsBase, frozen=True): + avg: datetime | None + min: datetime | None + max: datetime | None + q25: datetime | None + q50: datetime | None + q75: datetime | None + column_type: Literal["TIMESTAMP_NS"] + type: Literal["time_stamp"] = "time_stamp" + + +class ColumnStatisticsTimeStampAsFloat(ColumnStatisticsDouble, frozen=True): + type: Literal["time_stamp_float"] = "time_stamp_float" + + +class ColumnStatisticsCategorical(ColumnStatisticsVarchar, frozen=True): + type: Literal["categorical"] = "categorical" + + +class ColumnStatisticsJoinKey(ColumnStatisticsVarchar, frozen=True): + type: Literal["join_key"] = "join_key" + + +class ColumnStatisticsText(ColumnStatisticsVarchar, frozen=True): + type: Literal["text"] = "text" + + +class ColumnStatisticsUnusedFloat(ColumnStatisticsDouble, frozen=True): + type: Literal["unused_float"] = "unused_float" + + +class ColumnStatisticsUnusedString(ColumnStatisticsVarchar, frozen=True): + type: Literal["unused_string"] = "unused_string" + + +ColumnStatistics = Annotated[ + ColumnStatisticsNumerical + | ColumnStatisticsTarget + | ColumnStatisticsCategorical + | ColumnStatisticsJoinKey + | ColumnStatisticsTimeStamp + | ColumnStatisticsTimeStampAsFloat + | ColumnStatisticsText + | ColumnStatisticsUnusedFloat + | ColumnStatisticsUnusedString, + Field(discriminator="type"), +] + + +class ColumnType(str, Enum): + """Column types supported by GetML-IO based on DuckDBs SUMMARIZE statistics.""" + + DOUBLE = "DOUBLE" + TIMESTAMP_NS = "TIMESTAMP_NS" + VARCHAR = "VARCHAR" + + +ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING = { + (Role.CATEGORICAL, ColumnType.VARCHAR): ColumnStatisticsCategorical, + (Role.JOIN_KEY, ColumnType.VARCHAR): ColumnStatisticsJoinKey, + (Role.NUMERICAL, ColumnType.DOUBLE): ColumnStatisticsNumerical, + (Role.TARGET, ColumnType.DOUBLE): ColumnStatisticsTarget, + (Role.TIME_STAMP, ColumnType.TIMESTAMP_NS): ColumnStatisticsTimeStamp, + (Role.TIME_STAMP, ColumnType.DOUBLE): ColumnStatisticsTimeStampAsFloat, + (Role.TEXT, ColumnType.VARCHAR): ColumnStatisticsText, + (Role.UNUSED_FLOAT, ColumnType.DOUBLE): ColumnStatisticsUnusedFloat, + (Role.UNUSED_STRING, ColumnType.VARCHAR): ColumnStatisticsUnusedString, +} diff --git a/src/getml_io/metadata/dataframe_information.py b/src/getml_io/metadata/dataframe_information.py index 76b6560..3542e9e 100644 --- a/src/getml_io/metadata/dataframe_information.py +++ b/src/getml_io/metadata/dataframe_information.py @@ -1,94 +1,11 @@ from __future__ import annotations from collections.abc import Mapping -from datetime import datetime -from enum import Enum -from pathlib import Path -from typing import Annotated, Literal -from pydantic import BaseModel, Field +from pydantic import BaseModel from getml_io.getml.roles import Role - - -class ColumnStatisticsBase(BaseModel, frozen=True): - count: int - approx_unique: int - null_percentage: float | None - - -class ColumnStatisticsDouble(ColumnStatisticsBase, frozen=True): - avg: float | None - min: float | None - max: float | None - q25: float | None - q50: float | None - q75: float | None - std: float | None - column_type: Literal["DOUBLE"] - - -class ColumnStatisticsVarchar(ColumnStatisticsBase, frozen=True): - min: str | None - max: str | None - column_type: Literal["VARCHAR"] - - -class ColumnStatisticsNumerical(ColumnStatisticsDouble, frozen=True): - type: Literal["numerical"] = "numerical" - - -class ColumnStatisticsTarget(ColumnStatisticsDouble, frozen=True): - type: Literal["target"] = "target" - - -class ColumnStatisticsTimeStamp(ColumnStatisticsBase, frozen=True): - avg: datetime | None - min: datetime | None - max: datetime | None - q25: datetime | None - q50: datetime | None - q75: datetime | None - column_type: Literal["TIMESTAMP_NS"] - type: Literal["time_stamp"] = "time_stamp" - - -class ColumnStatisticsTimeStampAsFloat(ColumnStatisticsDouble, frozen=True): - type: Literal["time_stamp_float"] = "time_stamp_float" - - -class ColumnStatisticsCategorical(ColumnStatisticsVarchar, frozen=True): - type: Literal["categorical"] = "categorical" - - -class ColumnStatisticsJoinKey(ColumnStatisticsVarchar, frozen=True): - type: Literal["join_key"] = "join_key" - - -class ColumnStatisticsText(ColumnStatisticsVarchar, frozen=True): - type: Literal["text"] = "text" - - -class ColumnStatisticsUnusedFloat(ColumnStatisticsDouble, frozen=True): - type: Literal["unused_float"] = "unused_float" - - -class ColumnStatisticsUnusedString(ColumnStatisticsVarchar, frozen=True): - type: Literal["unused_string"] = "unused_string" - - -ColumnStatistics = Annotated[ - ColumnStatisticsNumerical - | ColumnStatisticsTarget - | ColumnStatisticsCategorical - | ColumnStatisticsJoinKey - | ColumnStatisticsTimeStamp - | ColumnStatisticsTimeStampAsFloat - | ColumnStatisticsText - | ColumnStatisticsUnusedFloat - | ColumnStatisticsUnusedString, - Field(discriminator="type"), -] +from getml_io.metadata.column_statistics import ColumnStatistics class ColumnInformation(BaseModel, frozen=True): @@ -99,29 +16,7 @@ class ColumnInformation(BaseModel, frozen=True): class DataFrameInformation(BaseModel, frozen=True): name: str - path: Path columns: Mapping[str, ColumnInformation] DataFrameInformationByName = Mapping[str, DataFrameInformation] - - -class ColumnType(str, Enum): - """Column types supported by GetML-IO based on DuckDBs SUMMARIZE statistics.""" - - DOUBLE = "DOUBLE" - TIMESTAMP_NS = "TIMESTAMP_NS" - VARCHAR = "VARCHAR" - - -ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING = { - (Role.CATEGORICAL, ColumnType.VARCHAR): ColumnStatisticsCategorical, - (Role.JOIN_KEY, ColumnType.VARCHAR): ColumnStatisticsJoinKey, - (Role.NUMERICAL, ColumnType.DOUBLE): ColumnStatisticsNumerical, - (Role.TARGET, ColumnType.DOUBLE): ColumnStatisticsTarget, - (Role.TIME_STAMP, ColumnType.TIMESTAMP_NS): ColumnStatisticsTimeStamp, - (Role.TIME_STAMP, ColumnType.DOUBLE): ColumnStatisticsTimeStampAsFloat, - (Role.TEXT, ColumnType.VARCHAR): ColumnStatisticsText, - (Role.UNUSED_FLOAT, ColumnType.DOUBLE): ColumnStatisticsUnusedFloat, - (Role.UNUSED_STRING, ColumnType.VARCHAR): ColumnStatisticsUnusedString, -} diff --git a/src/getml_io/metadata/exception.py b/src/getml_io/metadata/exception.py deleted file mode 100644 index 903c27c..0000000 --- a/src/getml_io/metadata/exception.py +++ /dev/null @@ -1,44 +0,0 @@ -from pathlib import Path - -from getml_io.utils.exception import GetMLIOError - - -class PathNotRelativeError(GetMLIOError): - """Exception raised when a path is not relative to a base path.""" - - def __init__( - self, - item_type: str, - item_name: str, - path: Path, - base_path: Path, - ) -> None: - """Initialize the exception with a custom message.""" - message = ( - f"'{item_type}' with name '{item_name}' and path '{path}' " - f"is not relative to base path '{base_path}'. " - "Ensure that the path is correctly set relative to the base path." - ) - super().__init__(message) - - -class DataFrameInformationPathNotRelativeError(PathNotRelativeError): - """Exception raised on erroneous DataFrameInformation path. - - Raised when the path of a DataFrameInformation is not relative - to the given base path. - """ - - def __init__( - self, - name: str, - path: Path, - base_path: Path, - ) -> None: - """Initialize the exception with a custom message.""" - super().__init__( - item_type="DataFrameInformation", - item_name=name, - path=path, - base_path=base_path, - ) diff --git a/src/getml_io/metadata/project_information.py b/src/getml_io/metadata/project_information.py new file mode 100644 index 0000000..5c40a54 --- /dev/null +++ b/src/getml_io/metadata/project_information.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from getml_io.metadata.container_information import ContainerInformation +from getml_io.metadata.pipeline_information import PipelineInformation + + +class ProjectInformation(BaseModel, frozen=True): + container_information: ContainerInformation + pipeline_information: PipelineInformation diff --git a/src/getml_io/serialize/column_information.py b/src/getml_io/serialize/column_information.py new file mode 100644 index 0000000..b89c12c --- /dev/null +++ b/src/getml_io/serialize/column_information.py @@ -0,0 +1,187 @@ +from __future__ import annotations + +import logging +from collections.abc import Callable, Mapping, Sequence +from logging import Logger +from pathlib import Path + +import pyarrow as pa +from getml.data import DataFrame, View +from getml.data.roles.types import Role as GetMLRole + +from getml_io.getml.roles import Role +from getml_io.metadata.column_statistics import ( + ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING, + ColumnStatistics, + ColumnType, +) +from getml_io.metadata.dataframe_information import ( + ColumnInformation, +) +from getml_io.serialize.exception import ( + UnsupportedColumnStatisticsError, +) +from getml_io.serialize.roles import convert_role +from getml_io.utils.convert import assume_is_str +from getml_io.utils.duckdb import ( + SummaryStatisticsType, + fetch_raw_summary_statistics_for_dataframe, + fetch_raw_summary_statistics_for_table, # pyright: ignore [reportUnknownVariableType] + fetch_raw_summary_statistics_from_parquet, +) + +logger: Logger = logging.getLogger(__name__) + + +ColumnInformationByName = Mapping[str, ColumnInformation] + + +def build_column_information_by_name_for_dataframe( + dataframe_or_view: DataFrame | View, +) -> ColumnInformationByName: + """Build column information for each column in a getML DataFrame or View. + + Args: + dataframe_or_view: A getML DataFrame or View. + + Returns: + A mapping from column names to their corresponding ColumnInformation. + + """ + raw_summary_statistics = fetch_raw_summary_statistics_for_dataframe( + dataframe_or_view, + ) + summary_statistics = _build_column_statistics_by_name( + dataframe_name=assume_is_str(dataframe_or_view.name), + get_getml_role_by_column=dataframe_or_view.roles.column, + column_names=dataframe_or_view.columns, + raw_summary_statistics=raw_summary_statistics, + ) + return _build_column_information_by_name( + summary_statistics, + get_getml_role_by_column=dataframe_or_view.roles.column, + column_names=dataframe_or_view.columns, + ) + + +def build_column_information_by_name_for_parquet( + parquet_filepath: Path, + dataframe_name: str, + get_getml_role_by_column: Callable[[str], GetMLRole], + column_names: Sequence[str], +) -> ColumnInformationByName: + """Build column information for each column in a Parquet file. + + Args: + parquet_filepath: Path to the Parquet file. + dataframe_name: Name of the DataFrame represented by the Parquet file. + get_getml_role_by_column: A callable that returns the getML role + for a given column name. + column_names: A sequence of column names present in the Parquet file. + + Returns: + A mapping from column names to their corresponding ColumnInformation. + + """ + raw_summary_statistics = fetch_raw_summary_statistics_from_parquet(parquet_filepath) + summary_statistics = _build_column_statistics_by_name( + dataframe_name=dataframe_name, + get_getml_role_by_column=get_getml_role_by_column, + column_names=column_names, + raw_summary_statistics=raw_summary_statistics, + ) + return _build_column_information_by_name( + summary_statistics, + get_getml_role_by_column=get_getml_role_by_column, + column_names=column_names, + ) + + +def build_column_information_by_name_for_table( + table: pa.Table, # pyright: ignore [reportUnknownMemberType, reportUnknownParameterType] + dataframe_name: str, + get_getml_role_by_column: Callable[[str], GetMLRole], + column_names: Sequence[str], +) -> ColumnInformationByName: + """Build column information for each column in a PyArrow Table. + + Args: + table: A PyArrow Table. + dataframe_name: Name of the DataFrame represented by the Table. + get_getml_role_by_column: A callable that returns the getML role + for a given column name. + column_names: A sequence of column names present in the Table. + + Returns: + A mapping from column names to their corresponding ColumnInformation. + + """ + raw_summary_statistics = fetch_raw_summary_statistics_for_table(table) # pyright: ignore [reportUnknownArgumentType] + summary_statistics = _build_column_statistics_by_name( + dataframe_name=dataframe_name, + get_getml_role_by_column=get_getml_role_by_column, + column_names=column_names, + raw_summary_statistics=raw_summary_statistics, + ) + return _build_column_information_by_name( + summary_statistics, + get_getml_role_by_column=get_getml_role_by_column, + column_names=column_names, + ) + + +def _build_column_information_by_name( + summary_statistics: Mapping[str, ColumnStatistics], + get_getml_role_by_column: Callable[[str], GetMLRole], + column_names: Sequence[str], +) -> ColumnInformationByName: + return { + column_name: ColumnInformation( + name=column_name, + role=Role(get_getml_role_by_column(column_name)), + statistics=summary_statistics[column_name], + ) + for column_name in column_names + } + + +def _build_column_statistics_by_name( + dataframe_name: str, + get_getml_role_by_column: Callable[[str], GetMLRole], + column_names: Sequence[str], + raw_summary_statistics: SummaryStatisticsType, +) -> dict[str, ColumnStatistics]: + return { + column_name: _get_column_statistics_type( + dataframe_name, + column_name, + get_getml_role_by_column(column_name), + assume_is_str(raw_summary_statistics[column_name]["column_type"]), + ).model_validate( + raw_summary_statistics[column_name], + ) + for column_name in column_names + } + + +def _get_column_statistics_type( + dataframe_name: str, + column_name: str, + column_role: GetMLRole, + column_type: str, +) -> type[ColumnStatistics]: + role = convert_role(column_role) + column_statistics_type = ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING.get( + ( + role, + ColumnType(column_type), + ), + ) + if column_statistics_type is None: + raise UnsupportedColumnStatisticsError( + dataframe_name, + column_name, + role, + column_type, + ) + return column_statistics_type diff --git a/src/getml_io/serialize/container.py b/src/getml_io/serialize/container.py index 0af2bd8..e72a37c 100644 --- a/src/getml_io/serialize/container.py +++ b/src/getml_io/serialize/container.py @@ -12,11 +12,9 @@ DataFrameInformationByName, ) from getml_io.serialize.container_information import serialize_container_information -from getml_io.serialize.dataframe_information import ( - derive_instance_with_relative_path, - derive_instances_with_relative_path, +from getml_io.serialize.dataframe_or_view import ( + serialize_dataframe_or_view, ) -from getml_io.serialize.dataframe_or_view import serialize_dataframe_or_view from getml_io.utils.convert import ( assume_is_bool, assume_is_dict_str_to_dataframe_or_view, @@ -29,108 +27,112 @@ def serialize_container( container: Container, - target_storage_directory: Path, -) -> tuple[Path, ContainerInformation]: - """Serialize a getML container into the target storage directory. + *, + target_storage_directory: Path | None = None, +) -> ContainerInformation: + """Serialize a GetML Container. Args: - container: The container to serialize. + container: The GetML Container to serialize. target_storage_directory: The directory where the serialized container - will be saved. + will be saved. If None, the container will not be saved to disk. Returns: - tuple[Path, ContainerInformation]: A tuple containing: - Path: The path to the serialized container information file. - ContainerInformation: The serialized container information. + ContainerInformation: The serialized container information. """ - container_storage_directory = target_storage_directory / "container" + container_storage_directory = ( + (target_storage_directory / "container") if target_storage_directory else None + ) population_information = serialize_population( - container, - container_storage_directory, + container=container, + container_storage_directory=container_storage_directory, ) peripheral_information = serialize_peripheral( - container, - container_storage_directory, + container=container, + container_storage_directory=container_storage_directory, ) subsets_information = serialize_subsets( - container, - container_storage_directory, + container=container, + container_storage_directory=container_storage_directory, ) container_information = ContainerInformation( id=assume_is_str(container.id), - population=derive_instance_with_relative_path( - population_information, - target_storage_directory, - ) - if population_information - else None, - peripheral=derive_instances_with_relative_path( - peripheral_information, - target_storage_directory, - ), - subsets=derive_instances_with_relative_path( - subsets_information, - target_storage_directory, - ), + population=population_information, + peripheral=peripheral_information, + subsets=subsets_information, deep_copy=assume_is_bool(container.deep_copy), ) - container_information_json_path = serialize_container_information( - container_information, - target_storage_directory, - ) - return container_information_json_path, container_information + if target_storage_directory is not None: + _ = serialize_container_information( + container_information, + target_storage_directory, + ) + return container_information def serialize_population( container: Container, - container_storage_directory: Path, + *, + container_storage_directory: Path | None = None, ) -> DataFrameInformation | None: - """Serialize the population of a container into the container storage directory. + """Serialize the population dataframe or view of a GetML Container. Args: - container: The container to serialize. + container: The GetML Container containing the population dataframe or view. container_storage_directory: The directory where the serialized population - will be saved. + will be saved. If None, the population will not be saved to disk. Returns: - DataFrameInformation | None: The serialized population information or None - if the population is not set. + DataFrameInformation | None: The serialized population information, + or None if the population is not set. """ population = assume_is_optional_dataframe_or_view(container.population) if population is None: return None + population_storage_directory = ( + (container_storage_directory / "population") + if container_storage_directory is not None + else None + ) + return serialize_dataframe_or_view( - population, - container_storage_directory / "population", + dataframe_or_view=population, + target_storage_directory=population_storage_directory, ) def serialize_peripheral( container: Container, - container_storage_directory: Path, + *, + container_storage_directory: Path | None = None, ) -> DataFrameInformationByName: - """Serialize the peripherals of a container into the container storage directory. + """Serialize the peripheral dataframes or views of a GetML Container. Args: - container: The container to serialize. - container_storage_directory: The directory where the serialized peripherals - will be saved. + container: The GetML Container containing the peripheral dataframes or views. + container_storage_directory: The directory where the serialized peripheral + dataframes or views will be saved. If None, the peripheral dataframes + or views will not be saved to disk. Returns: - DataFrameInformationByName: A dictionary mapping peripheral names to - their serialized DataFrameInformation. + DataFrameInformationByName: A dictionary mapping peripheral names to their + serialized information. """ - peripheral_storage_directory = container_storage_directory / "peripheral" + peripheral_storage_directory = ( + (container_storage_directory / "peripheral") + if container_storage_directory is not None + else None + ) peripherals = assume_is_dict_str_to_dataframe_or_view(container.peripheral) return { name: serialize_dataframe_or_view( - dataframe_or_view, - peripheral_storage_directory, + dataframe_or_view=dataframe_or_view, + target_storage_directory=peripheral_storage_directory, filename_prefix=name, ) for name, dataframe_or_view in peripherals.items() @@ -139,26 +141,32 @@ def serialize_peripheral( def serialize_subsets( container: Container, - container_storage_directory: Path, + *, + container_storage_directory: Path | None = None, ) -> DataFrameInformationByName: - """Serialize the subsets of a container into the container storage directory. + """Serialize the subset dataframes or views of a GetML Container. Args: - container: The container to serialize. - container_storage_directory: The directory where the serialized subsets - will be saved. + container: The GetML Container containing the subset dataframes or views. + container_storage_directory: The directory where the serialized subset + dataframes or views will be saved. If None, the subset dataframes + or views will not be saved to disk. Returns: - DataFrameInformationByName: A dictionary mapping subset names to - their serialized DataFrameInformation. + DataFrameInformationByName: A dictionary mapping subset names to their + serialized information. """ - subsets_storage_directory = container_storage_directory / "subsets" + subsets_storage_directory = ( + (container_storage_directory / "subsets") + if container_storage_directory is not None + else None + ) subsets = assume_is_dict_str_to_dataframe_or_view(container.subsets) return { name: serialize_dataframe_or_view( - dataframe_or_view, - subsets_storage_directory, + dataframe_or_view=dataframe_or_view, + target_storage_directory=subsets_storage_directory, filename_prefix=name, ) for name, dataframe_or_view in subsets.items() diff --git a/src/getml_io/serialize/container_information.py b/src/getml_io/serialize/container_information.py index adee2f8..b7c09d6 100644 --- a/src/getml_io/serialize/container_information.py +++ b/src/getml_io/serialize/container_information.py @@ -1,5 +1,7 @@ from __future__ import annotations +import logging +from logging import Logger from pathlib import Path from getml_io.metadata.container_information import ContainerInformation @@ -8,6 +10,8 @@ ContainerInformationStorageError, ) +logger: Logger = logging.getLogger(__name__) + def serialize_container_information( container_information: ContainerInformation, @@ -38,13 +42,17 @@ def serialize_container_information( container_information.id, ) from exception - container_json_path = target_storage_directory / "container.json" + container_information_json_path = target_storage_directory / "container.json" try: - _ = container_json_path.write_text(container_information_json) + _ = container_information_json_path.write_text(container_information_json) except Exception as exception: raise ContainerInformationStorageError( container_information.id, - container_json_path, + container_information_json_path, ) from exception - return container_json_path + logger.info( + "Serialized Container to %s", + container_information_json_path, + ) + return container_information_json_path diff --git a/src/getml_io/serialize/data_model.py b/src/getml_io/serialize/data_model.py index f65b639..ef5591e 100644 --- a/src/getml_io/serialize/data_model.py +++ b/src/getml_io/serialize/data_model.py @@ -6,22 +6,22 @@ from getml_io.metadata.data_model_information import ( DataModelInformation, ) -from getml_io.serialize.placeholder import serialize_placeholder +from getml_io.serialize.placeholder import convert_placeholder -def serialize_data_model(data_model: DataModel) -> DataModelInformation: - """Serialize a getML DataModel into a DataModelInformation object. +def convert_data_model(data_model: DataModel) -> DataModelInformation: + """Convert a getML DataModel into a DataModelInformation object. Args: - data_model: The DataModel to serialize. + data_model: The DataModel to convert. Returns: - DataModelInformation: The serialized DataModel information. + DataModelInformation: The converted DataModel information. """ peripheral = { name: [ - serialize_placeholder(placeholder) + convert_placeholder(placeholder) for placeholder in ( [placeholders] if isinstance(placeholders, Placeholder) @@ -34,7 +34,7 @@ def serialize_data_model(data_model: DataModel) -> DataModelInformation: ).items() } return DataModelInformation( - population=serialize_placeholder( + population=convert_placeholder( data_model.population, ), peripheral=peripheral, diff --git a/src/getml_io/serialize/dataframe_information.py b/src/getml_io/serialize/dataframe_information.py deleted file mode 100644 index 5e35592..0000000 --- a/src/getml_io/serialize/dataframe_information.py +++ /dev/null @@ -1,61 +0,0 @@ -from pathlib import Path - -from getml_io.metadata.dataframe_information import ( - DataFrameInformation, - DataFrameInformationByName, -) -from getml_io.metadata.exception import DataFrameInformationPathNotRelativeError - - -def derive_instance_with_relative_path( - dataframe_information: DataFrameInformation, - base_path: Path, -) -> DataFrameInformation: - """Derive a copy of an instance with a path relative to the given base path. - - Args: - dataframe_information: The instance to use as a template. - base_path: The base path to which the instance's path should be relative. - - Returns: - A new instance with the path relative to the base path. - - Raises: - DataFrameInformationPathNotRelativeError: If the instance's path cannot be made - relative to the base path. - - """ - try: - return dataframe_information.model_copy( - update={ - "path": dataframe_information.path.relative_to(base_path), - }, - ) - except Exception as exception: - raise DataFrameInformationPathNotRelativeError( - dataframe_information.name, - dataframe_information.path, - base_path, - ) from exception - - -def derive_instances_with_relative_path( - dataframe_information_by_name: DataFrameInformationByName, - base_path: Path, -) -> DataFrameInformationByName: - """Derive copies of instances with paths relative to the given base path. - - Args: - dataframe_information_by_name: A dictionary mapping names to - DataFrameInformation instances. - base_path: The base path to which the instances' paths should be relative. - - Returns: - A new dictionary with the same names, but with DataFrameInformation instances - having paths relative to the base path. - - """ - return { - name: derive_instance_with_relative_path(dataframe_information, base_path) - for name, dataframe_information in dataframe_information_by_name.items() - } diff --git a/src/getml_io/serialize/dataframe_or_view.py b/src/getml_io/serialize/dataframe_or_view.py index c5e0507..a19fca5 100644 --- a/src/getml_io/serialize/dataframe_or_view.py +++ b/src/getml_io/serialize/dataframe_or_view.py @@ -13,48 +13,57 @@ from getml_io.metadata.dataframe_information import ( DataFrameInformation, ) -from getml_io.serialize.parquet import serialize_dataframe +from getml_io.serialize.column_information import ( + build_column_information_by_name_for_dataframe, +) +from getml_io.serialize.parquet import serialize_object from getml_io.utils.convert import assume_is_str logger: Logger = logging.getLogger(__name__) -def _save_dataframe_or_view_as_parquet( - path: Path, - dataframe_or_view: DataFrame | View, -) -> None: - dataframe_or_view.to_parquet(str(path)) - - def serialize_dataframe_or_view( dataframe_or_view: DataFrame | View, - target_storage_directory: Path, *, + target_storage_directory: Path | None = None, filename_prefix: str | None = None, ) -> DataFrameInformation: - """Serialize a getML DataFrame or View into the target storage directory. + """Serialize a GetML DataFrame or View. Args: - dataframe_or_view: The DataFrame or View to serialize. - target_storage_directory: The directory where the serialized DataFrame or View - will be saved. - filename_prefix: An optional prefix for the filename. If provided - and different from the DataFrame or View name, it will be used as a - prefix for the filename, followed by the DataFrame or View name. - Else, only the DataFrame or View name will be used as the filename. + dataframe_or_view: The GetML DataFrame or View to serialize. + target_storage_directory: The directory where the serialized + DataFrame or View will be saved. If None, the DataFrame or View + will not be saved to disk. + filename_prefix: An optional prefix to add to the filename of the + saved Parquet file. Returns: DataFrameInformation: The serialized DataFrame or View information. """ - return serialize_dataframe( + if target_storage_directory is None: + return DataFrameInformation( + name=assume_is_str(dataframe_or_view.name), + columns=build_column_information_by_name_for_dataframe( + dataframe_or_view, + ), + ) + return serialize_object( target_storage_directory=target_storage_directory, save_parquet=functools.partial( _save_dataframe_or_view_as_parquet, dataframe_or_view=dataframe_or_view, ), - dataframe_name=assume_is_str(dataframe_or_view.name), + object_name=assume_is_str(dataframe_or_view.name), get_getml_role_by_column=dataframe_or_view.roles.column, column_names=dataframe_or_view.columns, filename_prefix=filename_prefix, ) + + +def _save_dataframe_or_view_as_parquet( + path: Path, + dataframe_or_view: DataFrame | View, +) -> None: + dataframe_or_view.to_parquet(str(path)) diff --git a/src/getml_io/serialize/exception.py b/src/getml_io/serialize/exception.py index b64904b..ee3f4e4 100644 --- a/src/getml_io/serialize/exception.py +++ b/src/getml_io/serialize/exception.py @@ -3,7 +3,7 @@ from getml.pipeline.score import Score as GetMLScore from getml_io.getml.roles import Role -from getml_io.metadata.dataframe_information import ( +from getml_io.metadata.column_statistics import ( ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING, ) from getml_io.utils.exception import GetMLIOError diff --git a/src/getml_io/serialize/parquet.py b/src/getml_io/serialize/parquet.py index b819bf7..70cb799 100644 --- a/src/getml_io/serialize/parquet.py +++ b/src/getml_io/serialize/parquet.py @@ -1,209 +1,110 @@ from __future__ import annotations import logging -from collections.abc import Callable, Mapping, Sequence +from collections.abc import Callable, Sequence from logging import Logger from pathlib import Path -from typing import cast -import duckdb -import pyarrow as pa -import pyarrow.parquet as pq from getml.data.roles.types import Role as GetMLRole -from getml_io.getml.roles import Role from getml_io.metadata.dataframe_information import ( - ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING, - ColumnInformation, - ColumnStatistics, - ColumnType, DataFrameInformation, ) +from getml_io.serialize.column_information import ( + build_column_information_by_name_for_parquet, +) from getml_io.serialize.exception import ( DataFrameParquetStorageError, - UnsupportedColumnStatisticsError, ) -from getml_io.serialize.roles import serialize_role -from getml_io.utils.convert import assume_is_str from getml_io.utils.exception import StorageDirectoryCreationError logger: Logger = logging.getLogger(__name__) -def save_table_as_parquet(path: Path, table: pa.Table) -> None: # pyright: ignore [reportUnknownParameterType, reportUnknownMemberType] - """Save a PyArrow Table as a Parquet file.""" - pq.write_table(table, path) # pyright: ignore [reportUnknownMemberType, reportUnknownArgumentType] - - -def serialize_dataframe( # noqa: PLR0913 +def serialize_object( # noqa: PLR0913 target_storage_directory: Path, save_parquet: Callable[[Path], None], - dataframe_name: str, + object_name: str, get_getml_role_by_column: Callable[[str], GetMLRole], column_names: Sequence[str], *, filename_prefix: str | None = None, ) -> DataFrameInformation: - """Serialize a dataframe into the target storage directory. + """Serialize an object to parquet. Args: - target_storage_directory: The directory where the serialized dataframe - will be saved. - save_parquet: A callable that saves the dataframe as a Parquet file + target_storage_directory: The directory where the serialized + object will be saved. + save_parquet: A callable that saves the object as a Parquet file to a given path. - dataframe_name: The name of the dataframe. - get_getml_role_by_column: A callable that returns the getML role + object_name: The name of the object. + get_getml_role_by_column: A callable that returns the GetML role for a given column name. - column_names: The names of the columns in the dataframe. - filename_prefix: An optional prefix for the filename. - If provided and different from the DataFrame name, it will be used - as a prefix for the filename, followed by the DataFrame name. - Else, only the DataFrame name will be used as the filename. + column_names: A sequence of column names in the object. + filename_prefix: An optional prefix to add to the filename of the + saved Parquet file. If None, no prefix is added. Returns: - DataFrameInformation: The serialized DataFrame information. - - Raises: - StorageDirectoryCreationError: If the target storage directory - cannot be created. - DataFrameParquetStorageError: If storing the DataFrame as a Parquet file fails. + DataFrameInformation: The serialized object information. """ + parquet_filepath = _save_object_as_parquet( + target_storage_directory=target_storage_directory, + save_parquet=save_parquet, + object_name=object_name, + filename_prefix=filename_prefix, + ) + + return _extract_dataframe_information( + parquet_filepath=parquet_filepath, + dataframe_name=object_name, + get_getml_role_by_column=get_getml_role_by_column, + column_names=column_names, + ) + + +def _save_object_as_parquet( + target_storage_directory: Path, + save_parquet: Callable[[Path], None], + object_name: str, + *, + filename_prefix: str | None = None, +) -> Path: try: target_storage_directory.mkdir(parents=True, exist_ok=True) except Exception as exception: raise StorageDirectoryCreationError(target_storage_directory) from exception filename = ( - f"{filename_prefix}.{dataframe_name}" - if filename_prefix and filename_prefix != dataframe_name - else dataframe_name + f"{filename_prefix}.{object_name}" + if filename_prefix and filename_prefix != object_name + else object_name ) parquet_filepath = target_storage_directory / f"{filename}.parquet" try: save_parquet(parquet_filepath) except Exception as exception: raise DataFrameParquetStorageError( - dataframe_name, + object_name, parquet_filepath, ) from exception - - column_information_by_name = _build_column_information_by_name( - parquet_filepath, - dataframe_name, - get_getml_role_by_column, - column_names, - ) - - return DataFrameInformation( - name=dataframe_name, - path=parquet_filepath, - columns=column_information_by_name, - ) + return parquet_filepath -def _build_column_information_by_name( +def _extract_dataframe_information( parquet_filepath: Path, dataframe_name: str, get_getml_role_by_column: Callable[[str], GetMLRole], column_names: Sequence[str], -) -> dict[str, ColumnInformation]: - summary_statistics = _calculate_summary_statistics( - parquet_filepath, - dataframe_name, - get_getml_role_by_column, - column_names, - ) - return { - column_name: ColumnInformation( - name=column_name, - role=Role(get_getml_role_by_column(column_name)), - statistics=summary_statistics[column_name], - ) - for column_name in column_names - } - - -def _calculate_summary_statistics( - parquet_filepath: Path, - dataframe_name: str, - get_getml_role_by_column: Callable[[str], GetMLRole], - column_names: Sequence[str], -) -> dict[str, ColumnStatistics]: - raw_summary_statistics = _fetch_raw_summary_statistics(parquet_filepath) - return _build_column_statistics_by_name( - dataframe_name, - get_getml_role_by_column, - column_names, - raw_summary_statistics, +) -> DataFrameInformation: + column_information_by_name = build_column_information_by_name_for_parquet( + parquet_filepath=parquet_filepath, + dataframe_name=dataframe_name, + get_getml_role_by_column=get_getml_role_by_column, + column_names=column_names, ) - -SUMMARIZE_STATEMENT_TEMPLATE = "SUMMARIZE (SELECT * FROM read_parquet(?))" - - -def _fetch_raw_summary_statistics( - parquet_filepath: Path, -) -> dict[str, dict[str, str | int | float | None]]: - with ( - duckdb.connect() as connection, # pyright: ignore [reportUnknownMemberType] - ): - logger.debug( - "Calculating summary statistics for Parquet '%s'", - parquet_filepath, - ) - return cast( - "dict[str, dict[str, str | int | float | None]]", - cast( - "object", - connection.execute( # pyright: ignore [reportUnknownMemberType] - SUMMARIZE_STATEMENT_TEMPLATE, - [str(parquet_filepath)], - ) - .df() - .set_index("column_name") - .to_dict(orient="index"), - ), - ) - - -def _build_column_statistics_by_name( - dataframe_name: str, - get_getml_role_by_column: Callable[[str], GetMLRole], - column_names: Sequence[str], - raw_summary_statistics: Mapping[str, Mapping[str, str | int | float | None]], -) -> dict[str, ColumnStatistics]: - return { - column_name: _get_column_statistics_type( - dataframe_name, - column_name, - get_getml_role_by_column(column_name), - assume_is_str(raw_summary_statistics[column_name]["column_type"]), - ).model_validate( - raw_summary_statistics[column_name], - ) - for column_name in column_names - } - - -def _get_column_statistics_type( - dataframe_name: str, - column_name: str, - column_role: GetMLRole, - column_type: str, -) -> type[ColumnStatistics]: - role = serialize_role(column_role) - column_statistics_type = ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING.get( - ( - role, - ColumnType(column_type), - ), + return DataFrameInformation( + name=dataframe_name, + columns=column_information_by_name, ) - if column_statistics_type is None: - raise UnsupportedColumnStatisticsError( - dataframe_name, - column_name, - role, - column_type, - ) - return column_statistics_type diff --git a/src/getml_io/serialize/pipeline.py b/src/getml_io/serialize/pipeline.py index 5d5dec9..f8a6ba9 100644 --- a/src/getml_io/serialize/pipeline.py +++ b/src/getml_io/serialize/pipeline.py @@ -1,5 +1,4 @@ import dataclasses -import functools from collections.abc import Sequence from pathlib import Path from typing import cast @@ -63,22 +62,24 @@ ) from getml_io.getml.scores import ClassificationScore, RegressionScore, Scores from getml_io.getml.tables import Table -from getml_io.metadata.dataframe_information import DataFrameInformationByName +from getml_io.metadata.dataframe_information import ( + DataFrameInformationByName, +) from getml_io.metadata.pipeline_information import ( LossFunction, PipelineInformation, ) -from getml_io.serialize.data_model import serialize_data_model -from getml_io.serialize.dataframe_information import derive_instances_with_relative_path -from getml_io.serialize.dataframe_or_view import serialize_dataframe_or_view -from getml_io.serialize.exception import WrongPipelineScoreTypeError -from getml_io.serialize.parquet import ( - save_table_as_parquet, # pyright: ignore [reportUnknownVariableType] - serialize_dataframe, +from getml_io.serialize.data_model import convert_data_model +from getml_io.serialize.dataframe_or_view import ( + serialize_dataframe_or_view, ) +from getml_io.serialize.exception import WrongPipelineScoreTypeError from getml_io.serialize.pipeline_information import serialize_pipeline_information -from getml_io.serialize.placeholder import serialize_placeholder -from getml_io.serialize.roles import serialize_roles +from getml_io.serialize.placeholder import convert_placeholder +from getml_io.serialize.roles import convert_roles +from getml_io.serialize.table import ( + serialize_table, # pyright: ignore [reportUnknownVariableType] +) from getml_io.utils.convert import ( assume_is_dict_str_to_dataframe_or_view, ) @@ -87,47 +88,49 @@ def serialize_pipeline( pipeline: Pipeline, container: Container, - target_storage_directory: Path, -) -> tuple[Path, PipelineInformation]: - """Serialize a getML Pipeline into the target storage directory. + *, + target_storage_directory: Path | None = None, +) -> PipelineInformation: + """Serialize a getML Pipeline. Args: - pipeline: The Pipeline to serialize. - container: The Container that the Pipeline operates on. + pipeline: The getML Pipeline to serialize. + container: The getML Container used with the Pipeline. target_storage_directory: The directory where the serialized Pipeline - will be saved. + will be saved. If None, the Pipeline will not be saved to disk. Returns: - tuple[Path, PipelineInformation]: A tuple containing: - Path: The path to the serialized Pipeline information file. - PipelineInformation: The serialized Pipeline information. + PipelineInformation: The serialized Pipeline information. """ - pipeline_storage_directory = target_storage_directory / "pipeline" + pipeline_storage_directory = ( + (target_storage_directory / "pipeline") + if target_storage_directory is not None + else None + ) + + predictions = serialize_predictions( + pipeline=pipeline, + container=container, + target_storage_directory=pipeline_storage_directory, + ) + + feature_sets = serialize_feature_sets( + pipeline=pipeline, + container=container, + target_storage_directory=pipeline_storage_directory, + ) + pipeline_information = PipelineInformation( id=pipeline.id, - predictions=derive_instances_with_relative_path( - serialize_predictions( - pipeline=pipeline, - container=container, - target_storage_directory=pipeline_storage_directory, - ), - target_storage_directory, - ), - feature_sets=derive_instances_with_relative_path( - serialize_feature_sets( - pipeline=pipeline, - container=container, - target_storage_directory=pipeline_storage_directory, - ), - target_storage_directory, - ), + predictions=predictions, + feature_sets=feature_sets, feature_learners=[ - serialize_feature_learner(feature_learner) + convert_feature_learner(feature_learner) for feature_learner in pipeline.feature_learners ], feature_selectors=[ - serialize_predictor(feature_selector) + convert_predictor(feature_selector) for feature_selector in pipeline.feature_selectors ], include_categorical=pipeline.include_categorical, @@ -135,51 +138,56 @@ def serialize_pipeline( is_regression=pipeline.is_regression, loss_function=LossFunction(pipeline.loss_function), peripheral=[ - serialize_placeholder(placeholder) for placeholder in pipeline.peripheral - ], - predictors=[ - serialize_predictor(predictor) for predictor in pipeline.predictors + convert_placeholder(placeholder) for placeholder in pipeline.peripheral ], + predictors=[convert_predictor(predictor) for predictor in pipeline.predictors], preprocessors=[ - serialize_preprocessor(preprocessor) + convert_preprocessor(preprocessor) for preprocessor in pipeline.preprocessors ], share_selected_features=pipeline.share_selected_features, tags=pipeline.tags, targets=pipeline.targets, - data_model=serialize_data_model(pipeline.data_model), - features=serialize_features(pipeline.features), - scores=serialize_scores(pipeline.scores), - columns=serialize_columns(pipeline.columns), - metadata=serialize_all_metadata(pipeline.metadata), - tables=serialize_tables(pipeline.tables), - ) - pipeline_information_json_path = serialize_pipeline_information( - pipeline_information=pipeline_information, - target_storage_directory=target_storage_directory, + data_model=convert_data_model(pipeline.data_model), + features=convert_features(pipeline.features), + scores=convert_scores(pipeline.scores), + columns=convert_columns(pipeline.columns), + metadata=convert_all_metadata(pipeline.metadata), + tables=convert_tables(pipeline.tables), ) - return pipeline_information_json_path, pipeline_information + + if target_storage_directory is not None: + _ = serialize_pipeline_information( + pipeline_information=pipeline_information, + target_storage_directory=target_storage_directory, + ) + + return pipeline_information def serialize_predictions( pipeline: Pipeline, container: Container, - target_storage_directory: Path, + *, + target_storage_directory: Path | None = None, ) -> DataFrameInformationByName: - """Serialize the predictions created from all subsets of a Container. + """Serialize predictions made by a getML Pipeline. Args: - pipeline: The Pipeline to use for making predictions. - container: The Container that the Pipeline operates on. + pipeline: The getML Pipeline to use for predictions. + container: The getML Container to make predictions on. target_storage_directory: The directory where the serialized predictions - will be saved. + will be saved. If None, the predictions will not be saved to disk. Returns: - DataFrameInformationByName: A dictionary-like object containing the serialized - predictions for each subset of the Container. + DataFrameInformationByName: The serialized predictions information. """ - predict_storage_directory = target_storage_directory / "predictions" + predict_storage_directory = ( + (target_storage_directory / "predictions") + if target_storage_directory is not None + else None + ) prediction_results: DataFrameInformationByName = {} for subset_name in assume_is_dict_str_to_dataframe_or_view(container.subsets): prediction = cast( @@ -191,16 +199,12 @@ def serialize_predictions( prediction.T, names=column_names, ) - - prediction_results[subset_name] = serialize_dataframe( - target_storage_directory=predict_storage_directory, - save_parquet=functools.partial( - save_table_as_parquet, - table=prediction_table, - ), - dataframe_name=f"prediction.{subset_name}", + prediction_results[subset_name] = serialize_table( + table=prediction_table, # pyright: ignore [reportUnknownArgumentType] + table_name=f"prediction.{subset_name}", get_getml_role_by_column=(lambda _column_name: getml_roles.target), column_names=column_names, + target_storage_directory=predict_storage_directory, ) return prediction_results @@ -209,51 +213,55 @@ def serialize_predictions( def serialize_feature_sets( pipeline: Pipeline, container: Container, - target_storage_directory: Path, + *, + target_storage_directory: Path | None = None, ) -> DataFrameInformationByName: - """Serialize the feature sets created from all subsets of a Container. + """Serialize feature sets generated by a getML Pipeline. Args: - pipeline: The Pipeline to use for transforming the subsets. - container: The Container that the Pipeline operates on. + pipeline: The getML Pipeline to use for generating feature sets. + container: The getML Container to generate feature sets on. target_storage_directory: The directory where the serialized feature sets - will be saved. + will be saved. If None, the feature sets will not be saved to disk. Returns: - DataFrameInformationByName: A dictionary-like object containing the serialized - feature sets for each subset of the Container. + DataFrameInformationByName: The serialized feature sets information. """ - transform_storage_directory = target_storage_directory / "feature_sets" + transform_storage_directory = ( + (target_storage_directory / "feature_sets") + if target_storage_directory is not None + else None + ) feature_sets: DataFrameInformationByName = {} + for subset_name in assume_is_dict_str_to_dataframe_or_view(container.subsets): features = pipeline.transform( # pyright: ignore [reportUnknownMemberType, reportUnknownVariableType] container[subset_name], df_name=f"features.{subset_name}", ) - dataframe_information = serialize_dataframe_or_view( + feature_sets[subset_name] = serialize_dataframe_or_view( cast("DataFrame", features), - transform_storage_directory, + target_storage_directory=transform_storage_directory, ) - feature_sets[subset_name] = dataframe_information return feature_sets -def serialize_feature_learner( +def convert_feature_learner( feature_learner: getml_feature_learner.Fastboost | getml_feature_learner.FastProp | getml_feature_learner.Multirel | getml_feature_learner.Relboost | getml_feature_learner.RelMT, ) -> FeatureLearner: - """Serialize a getML FeatureLearner into a FeatureLearner object. + """Convert a getML FeatureLearner into a FeatureLearner object. Args: - feature_learner: The FeatureLearner to serialize. + feature_learner: The FeatureLearner to convert. Returns: - FeatureLearner: The serialized FeatureLearner information. + FeatureLearner: The converted FeatureLearner information. """ feature_learner_as_dict = dataclasses.asdict(feature_learner) @@ -270,7 +278,7 @@ def serialize_feature_learner( return RelMT.model_validate(feature_learner_as_dict) -def serialize_predictor( +def convert_predictor( predictor: getml_predictor.LinearRegression | getml_predictor.LogisticRegression | getml_predictor.ScaleGBMClassifier @@ -278,13 +286,13 @@ def serialize_predictor( | getml_predictor.XGBoostClassifier | getml_predictor.XGBoostRegressor, ) -> Predictor: - """Serialize a getML Predictor into a Predictor object. + """Convert a getML Predictor into a Predictor object. Args: - predictor: The Predictor to serialize. + predictor: The Predictor to convert. Returns: - Predictor: The serialized Predictor information. + Predictor: The converted Predictor information. """ predictor_as_dict = dataclasses.asdict(predictor) @@ -303,7 +311,7 @@ def serialize_predictor( return XGBoostRegressor.model_validate(predictor_as_dict) -def serialize_preprocessor( # noqa: PLR0911 +def convert_preprocessor( # noqa: PLR0911 preprocessor: getml_preprocessor.CategoryTrimmer | getml_preprocessor.EmailDomain | getml_preprocessor.Imputation @@ -312,13 +320,13 @@ def serialize_preprocessor( # noqa: PLR0911 | getml_preprocessor.Substring | getml_preprocessor.TextFieldSplitter, ) -> Preprocessor: - """Serialize a getML Preprocessor into a Preprocessor object. + """Convert a getML Preprocessor into a Preprocessor object. Args: - preprocessor: The Preprocessor to serialize. + preprocessor: The Preprocessor to convert. Returns: - Preprocessor: The serialized Preprocessor information. + Preprocessor: The converted Preprocessor information. """ preprocessor_as_dict = dataclasses.asdict(preprocessor) @@ -339,14 +347,14 @@ def serialize_preprocessor( # noqa: PLR0911 return TextFieldSplitter.model_validate(preprocessor_as_dict) -def serialize_features(features: GetMLFeatures) -> Features: - """Serialize getML Features into a Features object. +def convert_features(features: GetMLFeatures) -> Features: + """Convert getML Features into a Features object. Args: - features: The getML Features to serialize. + features: The getML Features to convert. Returns: - Features: The serialized Features information. + Features: The converted Features information. """ return { @@ -362,24 +370,24 @@ def serialize_features(features: GetMLFeatures) -> Features: } -def serialize_scores(scores: GetMLScores) -> Scores: - """Serialize getML Scores into a Scores object. +def convert_scores(scores: GetMLScores) -> Scores: + """Convert getML Scores into a Scores object. Args: - scores: The getML Scores to serialize. + scores: The getML Scores to convert. Returns: - Scores: The serialized Scores information. + Scores: The converted Scores information. """ return ( - _serialize_classification_scores(list(scores)) + _convert_classification_scores(list(scores)) if scores.is_classification - else _serialize_regression_scores(list(scores)) + else _convert_regression_scores(list(scores)) ) -def _serialize_classification_scores( +def _convert_classification_scores( scores: list[GetMLScore], ) -> list[ClassificationScore]: classification_scores: list[ClassificationScore] = [] @@ -402,7 +410,7 @@ def _serialize_classification_scores( return classification_scores -def _serialize_regression_scores(scores: list[GetMLScore]) -> list[RegressionScore]: +def _convert_regression_scores(scores: list[GetMLScore]) -> list[RegressionScore]: regression_scores: list[RegressionScore] = [] for score in scores: if not isinstance(score, GetMLRegressionScore): @@ -423,14 +431,14 @@ def _serialize_regression_scores(scores: list[GetMLScore]) -> list[RegressionSco return regression_scores -def serialize_columns(getml_columns: GetMLColumns | None) -> list[Column]: - """Serialize getML Columns into a list of Column objects. +def convert_columns(getml_columns: GetMLColumns | None) -> list[Column]: + """Convert getML Columns into a list of Column objects. Args: - getml_columns: The getML Columns to serialize. + getml_columns: The getML Columns to convert. Returns: - list[Column]: The serialized Columns information. + list[Column]: The converted Columns information. """ if getml_columns is None: @@ -449,42 +457,42 @@ def serialize_columns(getml_columns: GetMLColumns | None) -> list[Column]: ] -def serialize_all_metadata(all_metadata: AllMetadata | None) -> PipelineMetaData: - """Serialize getML AllMetadata into a PipelineMetaData object. +def convert_all_metadata(all_metadata: AllMetadata | None) -> PipelineMetaData: + """Convert getML AllMetadata into a PipelineMetaData object. Args: - all_metadata: The getML AllMetadata to serialize. + all_metadata: The getML AllMetadata to convert. Returns: - PipelineMetaData: The serialized PipelineMetaData information. + PipelineMetaData: The converted PipelineMetaData information. """ if all_metadata is None: return PipelineMetaData(population=None, peripheral=[]) return PipelineMetaData( - population=_serialize_metadata(all_metadata.population), + population=_convert_metadata(all_metadata.population), peripheral=[ - _serialize_metadata(metadata) for metadata in all_metadata.peripheral + _convert_metadata(metadata) for metadata in all_metadata.peripheral ], ) -def _serialize_metadata(metadata: GetMLMetadata) -> DataFrameMetaData: +def _convert_metadata(metadata: GetMLMetadata) -> DataFrameMetaData: return DataFrameMetaData( name=metadata.name, - roles=serialize_roles(metadata.roles), + roles=convert_roles(metadata.roles), ) -def serialize_tables(tables: GetMLTables) -> list[Table]: - """Serialize getML Tables into a list of Table objects. +def convert_tables(tables: GetMLTables) -> list[Table]: + """Convert getML Tables into a list of Table objects. Args: - tables: The getML Tables to serialize. + tables: The getML Tables to convert. Returns: - list[Table]: The serialized Tables information. + list[Table]: The converted Tables information. """ return [ diff --git a/src/getml_io/serialize/pipeline_information.py b/src/getml_io/serialize/pipeline_information.py index 5716447..e5dfe02 100644 --- a/src/getml_io/serialize/pipeline_information.py +++ b/src/getml_io/serialize/pipeline_information.py @@ -1,3 +1,5 @@ +import logging +from logging import Logger from pathlib import Path from getml_io.metadata.pipeline_information import PipelineInformation @@ -6,6 +8,8 @@ PipelineInformationStorageError, ) +logger: Logger = logging.getLogger(__name__) + def serialize_pipeline_information( pipeline_information: PipelineInformation, @@ -37,13 +41,17 @@ def serialize_pipeline_information( raise PipelineInformationSerializationError( pipeline_information.id, ) from exception - pipeline_json_path = target_storage_directory / "pipeline.json" + pipeline_information_json_path = target_storage_directory / "pipeline.json" try: - _ = pipeline_json_path.write_text(pipeline_information_json) + _ = pipeline_information_json_path.write_text(pipeline_information_json) except Exception as exception: raise PipelineInformationStorageError( pipeline_information.id, - pipeline_json_path, + pipeline_information_json_path, ) from exception - return pipeline_json_path + logger.info( + "Serialized Pipeline to %s", + pipeline_information_json_path, + ) + return pipeline_information_json_path diff --git a/src/getml_io/serialize/placeholder.py b/src/getml_io/serialize/placeholder.py index 456f037..13fe2b3 100644 --- a/src/getml_io/serialize/placeholder.py +++ b/src/getml_io/serialize/placeholder.py @@ -7,25 +7,25 @@ JoinInformation, PlaceholderInformation, ) -from getml_io.serialize.roles import serialize_roles +from getml_io.serialize.roles import convert_roles -def serialize_placeholder(placeholder: Placeholder) -> PlaceholderInformation: - """Serialize a getML Placeholder into a PlaceholderInformation object. +def convert_placeholder(placeholder: Placeholder) -> PlaceholderInformation: + """Convert a getML Placeholder into a PlaceholderInformation object. Args: - placeholder: The Placeholder to serialize. + placeholder: The Placeholder to convert. Returns: - PlaceholderInformation: The serialized Placeholder information. + PlaceholderInformation: The converted Placeholder information. """ return PlaceholderInformation( name=placeholder.name, - roles=serialize_roles(placeholder.roles), + roles=convert_roles(placeholder.roles), joins=[ JoinInformation( - right=serialize_placeholder(join.right), + right=convert_placeholder(join.right), on=join.on, time_stamps=join.time_stamps, upper_time_stamp=join.upper_time_stamp, diff --git a/src/getml_io/serialize/project.py b/src/getml_io/serialize/project.py index 8aeb79f..014f694 100644 --- a/src/getml_io/serialize/project.py +++ b/src/getml_io/serialize/project.py @@ -5,8 +5,11 @@ from pathlib import Path from getml_io.getml.project import load_project -from getml_io.getml.project_information import ProjectInformation -from getml_io.serialize.container import serialize_container +from getml_io.getml.project_identification import ProjectIdentification +from getml_io.metadata.project_information import ProjectInformation +from getml_io.serialize.container import ( + serialize_container, +) from getml_io.serialize.pipeline import serialize_pipeline from getml_io.utils.storage import create_target_storage_directory @@ -14,49 +17,51 @@ def serialize_project( - project_information: ProjectInformation, - root_storage_directory: Path, + project_identification: ProjectIdentification, *, - clear_storage_directory: bool, -) -> None: - """Serialize a getML project. + root_storage_directory: Path | None = None, + clear_storage_directory: bool = False, +) -> ProjectInformation: + """Serialize a GetML Project. Args: + project_identification: The identification of the GetML Project to + serialize. root_storage_directory: The directory where the serialized project - will be saved. - project_information: The project information for the project, - that is to be serialized. - clear_storage_directory: Whether to clear the storage directory - before serialization. + will be saved. If None, the project will not be saved to disk. + clear_storage_directory: Whether to clear the storage directory if it + already exists. Only relevant if root_storage_directory is not None. + Defaults to False. + + Returns: + ProjectInformation: The serialized project information. """ - target_storage_directory: Path = create_target_storage_directory( - project_information, - root_storage_directory, - clear_storage_directory=clear_storage_directory, + target_storage_directory: Path | None = ( + create_target_storage_directory( + project_identification=project_identification, + root_storage_directory=root_storage_directory, + clear_storage_directory=clear_storage_directory, + ) + if root_storage_directory is not None + else None ) with load_project( - project_information.project_name, - project_information.pipeline_id, - project_information.container_id, + project_identification.project_name, + project_identification.pipeline_id, + project_identification.container_id, ) as project: - container_information_json_path, container_information = serialize_container( - project.container, - target_storage_directory, - ) - logger.info( - "Serialized Container to %s: %r", - container_information_json_path, - container_information, + container_information = serialize_container( + container=project.container, + target_storage_directory=target_storage_directory, ) - pipeline_information_json_path, pipeline_information = serialize_pipeline( - project.pipeline, - project.container, - target_storage_directory, + pipeline_information = serialize_pipeline( + pipeline=project.pipeline, + container=project.container, + target_storage_directory=target_storage_directory, ) - logger.info( - "Serialized Pipeline to %s: %r", - pipeline_information_json_path, - pipeline_information, + return ProjectInformation( + container_information=container_information, + pipeline_information=pipeline_information, ) diff --git a/src/getml_io/serialize/roles.py b/src/getml_io/serialize/roles.py index 2fc160c..499140d 100644 --- a/src/getml_io/serialize/roles.py +++ b/src/getml_io/serialize/roles.py @@ -7,14 +7,14 @@ from getml_io.utils.convert import assume_is_str -def serialize_roles(roles: GetMLRoles) -> Roles: - """Serialize the roles of getML into a Roles object. +def convert_roles(roles: GetMLRoles) -> Roles: + """Convert the roles of getML into a Roles object. Args: - roles: The Roles object to serialize. + roles: The Roles object to convert. Returns: - Roles: The serialized roles information. + Roles: The converted roles information. """ return Roles( @@ -29,14 +29,14 @@ def serialize_roles(roles: GetMLRoles) -> Roles: ) -def serialize_role(role: GetMLRole) -> Role: - """Serialize a getML Role into a Role object. +def convert_role(role: GetMLRole) -> Role: + """Convert a getML Role into a Role object. Args: - role: The Role to serialize. + role: The Role to convert. Returns: - Role: The serialized role information. + Role: The converted role information. """ return Role(assume_is_str(role)) diff --git a/src/getml_io/serialize/table.py b/src/getml_io/serialize/table.py new file mode 100644 index 0000000..dbbecf4 --- /dev/null +++ b/src/getml_io/serialize/table.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import functools +from collections.abc import Callable, Sequence +from pathlib import Path + +import pyarrow as pa +import pyarrow.parquet as pq +from getml.data.roles.types import Role as GetMLRole + +from getml_io.metadata.dataframe_information import ( + DataFrameInformation, +) +from getml_io.serialize.column_information import ( + build_column_information_by_name_for_table, # pyright: ignore [reportUnknownVariableType] +) +from getml_io.serialize.parquet import serialize_object + + +def serialize_table( # noqa: PLR0913 + table: pa.Table, # pyright: ignore [reportUnknownParameterType, reportUnknownMemberType] + table_name: str, + get_getml_role_by_column: Callable[[str], GetMLRole], + column_names: Sequence[str], + *, + target_storage_directory: Path | None = None, + filename_prefix: str | None = None, +) -> DataFrameInformation: + """Serialize a PyArrow Table. + + Args: + table: The PyArrow Table to serialize. + table_name: The name of the table. + get_getml_role_by_column: A callable that returns the GetML role + for a given column name. + column_names: A sequence of column names in the table. + target_storage_directory: The directory where the serialized + table will be saved. If None, the table will not be saved to disk. + filename_prefix: An optional prefix to add to the filename of the + saved Parquet file. If None, no prefix is added. + + Returns: + DataFrameInformation: The serialized table information. + + """ + if target_storage_directory is None: + return _extract_dataframe_information( + table=table, # pyright: ignore [reportUnknownArgumentType] + dataframe_name=table_name, + get_getml_role_by_column=get_getml_role_by_column, + column_names=column_names, + ) + + return serialize_object( + target_storage_directory=target_storage_directory, + save_parquet=functools.partial( + _save_table_as_parquet, + table=table, + ), + object_name=table_name, + get_getml_role_by_column=get_getml_role_by_column, + column_names=column_names, + filename_prefix=filename_prefix, + ) + + +def _save_table_as_parquet(path: Path, table: pa.Table) -> None: # pyright: ignore [reportUnknownParameterType, reportUnknownMemberType] + """Save a PyArrow Table as a Parquet file.""" + pq.write_table(table, path) # pyright: ignore [reportUnknownMemberType, reportUnknownArgumentType] + + +def _extract_dataframe_information( + table: pa.Table, # pyright: ignore [reportUnknownMemberType, reportUnknownParameterType] + dataframe_name: str, + get_getml_role_by_column: Callable[[str], GetMLRole], + column_names: Sequence[str], +) -> DataFrameInformation: + column_information_by_name = build_column_information_by_name_for_table( + table=table, # pyright: ignore [reportUnknownArgumentType] + dataframe_name=dataframe_name, + get_getml_role_by_column=get_getml_role_by_column, + column_names=column_names, + ) + + return DataFrameInformation( + name=dataframe_name, + columns=column_information_by_name, + ) diff --git a/src/getml_io/utils/duckdb.py b/src/getml_io/utils/duckdb.py new file mode 100644 index 0000000..b0c6be3 --- /dev/null +++ b/src/getml_io/utils/duckdb.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +import logging +from collections.abc import Mapping +from logging import Logger +from pathlib import Path +from typing import cast + +import duckdb +import pyarrow as pa +from getml.data import DataFrame, View + +logger: Logger = logging.getLogger(__name__) + +SUMMARIZE_PARQUET_TEMPLATE = "SUMMARIZE (SELECT * FROM read_parquet(?))" +SUMMARIZE_TABLE_STATEMENT = 'SUMMARIZE "table"' + +SummaryStatisticsType = Mapping[str, Mapping[str, str | int | float | None]] + + +def fetch_raw_summary_statistics_from_parquet( + parquet_filepath: Path, +) -> SummaryStatisticsType: + """Fetch raw summary statistics from a Parquet file. + + Args: + parquet_filepath: The path to the Parquet file. + + Returns: + A mapping from column names to their corresponding summary statistics. + + """ + logger.debug( + "Calculating summary statistics for Parquet '%s'", + parquet_filepath, + ) + return cast( + "SummaryStatisticsType", + duckdb.sql(SUMMARIZE_PARQUET_TEMPLATE, params=[str(parquet_filepath)]) # pyright: ignore [reportUnknownMemberType] + .df() + .set_index("column_name") + .to_dict(orient="index"), + ) + + +def fetch_raw_summary_statistics_for_dataframe( + dataframe_or_view: DataFrame | View, +) -> SummaryStatisticsType: + """Fetch raw summary statistics for a getML DataFrame or View. + + Args: + dataframe_or_view: A getML DataFrame or View. + + Returns: + A mapping from column names to their corresponding summary statistics. + + """ + dataframe_name = cast("str", dataframe_or_view.name) + logger.debug("Calculating summary statistics for DataFrame '%s'", dataframe_name) + with dataframe_or_view.to_arrow_stream() as stream: # pyright: ignore [reportUnknownVariableType] + summary_statistics = cast( + "SummaryStatisticsType", + duckdb.register("table", stream) # pyright: ignore [reportUnknownMemberType, reportUnknownArgumentType] + .sql(SUMMARIZE_TABLE_STATEMENT) + .df() + .set_index("column_name") + .to_dict(orient="index"), + ) + _ = duckdb.unregister("table") + return summary_statistics + + +def fetch_raw_summary_statistics_for_table( + table: pa.Table, # pyright: ignore [reportUnknownMemberType, reportUnknownParameterType] +) -> SummaryStatisticsType: + """Fetch raw summary statistics for a PyArrow Table. + + Args: + table: A PyArrow Table. + + Returns: + A mapping from column names to their corresponding summary statistics. + + """ + logger.debug("Calculating summary statistics for Table") + summary_statistics = cast( + "SummaryStatisticsType", + duckdb.register("table", table) # pyright: ignore [reportUnknownMemberType, reportUnknownArgumentType] + .sql(SUMMARIZE_TABLE_STATEMENT) + .df() + .set_index("column_name") + .to_dict(orient="index"), + ) + _ = duckdb.unregister("table") + return summary_statistics diff --git a/src/getml_io/utils/storage.py b/src/getml_io/utils/storage.py index bda0c14..0fde579 100644 --- a/src/getml_io/utils/storage.py +++ b/src/getml_io/utils/storage.py @@ -5,7 +5,7 @@ import platformdirs -from getml_io.getml.project_information import ProjectInformation +from getml_io.getml.project_identification import ProjectIdentification from getml_io.utils.exception import ( StorageDirectoryCreationError, StorageDirectoryRemovalError, @@ -33,13 +33,13 @@ def get_default_root_storage_directory() -> Path: def get_target_storage_directory( - project_information: ProjectInformation, + project_identification: ProjectIdentification, root_storage_directory: Path, ) -> Path: """Return the target storage directory for the serialized project. Args: - project_information: The project information for the serialized project. + project_identification: The project identification for the serialized project. root_storage_directory: The directory where the serialized project will be saved. @@ -47,15 +47,15 @@ def get_target_storage_directory( The path to the target storage directory for the serialized project. """ - project_name = project_information.project_name - pipeline_id = project_information.pipeline_id - container_id = project_information.container_id + project_name = project_identification.project_name + pipeline_id = project_identification.pipeline_id + container_id = project_identification.container_id target_name: str = f"{project_name}_{pipeline_id}_{container_id}" return root_storage_directory / target_name def create_target_storage_directory( - project_information: ProjectInformation, + project_identification: ProjectIdentification, root_storage_directory: Path, *, clear_storage_directory: bool, @@ -63,7 +63,7 @@ def create_target_storage_directory( """Create the target storage directory for the serialized project. Args: - project_information: The project information for the serialized project. + project_identification: The project identification for the serialized project. root_storage_directory: The directory where the serialized project will be saved. clear_storage_directory: Whether to clear the directory before serialization. @@ -78,7 +78,7 @@ def create_target_storage_directory( """ target_storage_directory: Path = get_target_storage_directory( - project_information, + project_identification, root_storage_directory, ) if not target_storage_directory.exists(): diff --git a/tests/integration/assertions.py b/tests/integration/assertions.py index b295d95..9e3f1be 100644 --- a/tests/integration/assertions.py +++ b/tests/integration/assertions.py @@ -3,10 +3,10 @@ from getml_io.getml.features import Features from getml_io.getml.scores import Scores +from getml_io.metadata.column_statistics import ColumnStatistics from getml_io.metadata.container_information import ContainerInformation from getml_io.metadata.dataframe_information import ( ColumnInformation, - ColumnStatistics, DataFrameInformation, ) from getml_io.metadata.pipeline_information import PipelineInformation @@ -101,7 +101,6 @@ def assert_dataframe_information( assert expected_dataframe_information is not None assert dataframe_information.name == expected_dataframe_information.name - assert dataframe_information.path == expected_dataframe_information.path assert ( dataframe_information.columns.keys() diff --git a/tests/integration/data/loans/expected.container.json b/tests/integration/data/loans/expected.container.json index aa57349..5de5e0d 100644 --- a/tests/integration/data/loans/expected.container.json +++ b/tests/integration/data/loans/expected.container.json @@ -4,7 +4,6 @@ "peripheral": { "meta": { "name": "meta", - "path": "container/peripheral/meta.parquet", "columns": { "account_id": { "name": "account_id", @@ -420,7 +419,6 @@ }, "order": { "name": "order", - "path": "container/peripheral/order.parquet", "columns": { "account_id": { "name": "account_id", @@ -519,7 +517,6 @@ }, "trans": { "name": "trans", - "path": "container/peripheral/trans.parquet", "columns": { "date": { "name": "date", @@ -682,7 +679,6 @@ "subsets": { "train": { "name": "train", - "path": "container/subsets/train.parquet", "columns": { "date_loan": { "name": "date_loan", @@ -866,7 +862,6 @@ }, "test": { "name": "test", - "path": "container/subsets/test.parquet", "columns": { "date_loan": { "name": "date_loan", diff --git a/tests/integration/data/loans/expected.pipeline.json b/tests/integration/data/loans/expected.pipeline.json index ab374d1..40739e7 100644 --- a/tests/integration/data/loans/expected.pipeline.json +++ b/tests/integration/data/loans/expected.pipeline.json @@ -3,7 +3,6 @@ "predictions": { "train": { "name": "prediction.train", - "path": "pipeline/predictions/prediction.train.parquet", "columns": { "default": { "name": "default", @@ -27,7 +26,6 @@ }, "test": { "name": "prediction.test", - "path": "pipeline/predictions/prediction.test.parquet", "columns": { "default": { "name": "default", @@ -53,7 +51,6 @@ "feature_sets": { "train": { "name": "features.train", - "path": "pipeline/feature_sets/features.train.parquet", "columns": { "date_loan": { "name": "date_loan", @@ -341,7 +338,6 @@ }, "test": { "name": "features.test", - "path": "pipeline/feature_sets/features.test.parquet", "columns": { "date_loan": { "name": "date_loan", @@ -1386,4 +1382,4 @@ "importance": 0.330471410273217 } ] -} \ No newline at end of file +} diff --git a/tests/integration/data/numerical/expected.container.json b/tests/integration/data/numerical/expected.container.json index 5ce170d..ad3fb31 100644 --- a/tests/integration/data/numerical/expected.container.json +++ b/tests/integration/data/numerical/expected.container.json @@ -4,7 +4,6 @@ "peripheral": { "perph": { "name": "perph", - "path": "container/peripheral/perph.parquet", "columns": { "time_stamp": { "name": "time_stamp", @@ -61,7 +60,6 @@ "subsets": { "train": { "name": "train", - "path": "container/subsets/train.parquet", "columns": { "time_stamp": { "name": "time_stamp", @@ -134,7 +132,6 @@ }, "test": { "name": "test", - "path": "container/subsets/test.parquet", "columns": { "time_stamp": { "name": "time_stamp", diff --git a/tests/integration/data/numerical/expected.pipeline.json b/tests/integration/data/numerical/expected.pipeline.json index 9b148de..71b5a76 100644 --- a/tests/integration/data/numerical/expected.pipeline.json +++ b/tests/integration/data/numerical/expected.pipeline.json @@ -3,7 +3,6 @@ "predictions": { "train": { "name": "prediction.train", - "path": "pipeline/predictions/prediction.train.parquet", "columns": { "targets": { "name": "targets", @@ -27,7 +26,6 @@ }, "test": { "name": "prediction.test", - "path": "pipeline/predictions/prediction.test.parquet", "columns": { "targets": { "name": "targets", @@ -53,7 +51,6 @@ "feature_sets": { "train": { "name": "features.train", - "path": "pipeline/feature_sets/features.train.parquet", "columns": { "time_stamp": { "name": "time_stamp", @@ -305,7 +302,6 @@ }, "test": { "name": "features.test", - "path": "pipeline/feature_sets/features.test.parquet", "columns": { "time_stamp": { "name": "time_stamp", @@ -930,4 +926,4 @@ "importance": 0.7635181706678932 } ] -} \ No newline at end of file +} diff --git a/tests/integration/data/robot/expected.container.json b/tests/integration/data/robot/expected.container.json index a34f846..761d320 100644 --- a/tests/integration/data/robot/expected.container.json +++ b/tests/integration/data/robot/expected.container.json @@ -4,7 +4,6 @@ "peripheral": { "full": { "name": "full", - "path": "container/peripheral/full.parquet", "columns": { "rowid": { "name": "rowid", @@ -1758,7 +1757,6 @@ "subsets": { "train": { "name": "full", - "path": "container/subsets/train.full.parquet", "columns": { "rowid": { "name": "rowid", @@ -3510,7 +3508,6 @@ }, "validation": { "name": "full", - "path": "container/subsets/validation.full.parquet", "columns": { "rowid": { "name": "rowid", @@ -5262,7 +5259,6 @@ }, "test": { "name": "full", - "path": "container/subsets/test.full.parquet", "columns": { "rowid": { "name": "rowid", diff --git a/tests/integration/data/robot/expected.pipeline.json b/tests/integration/data/robot/expected.pipeline.json index a1cb26e..cb6b296 100644 --- a/tests/integration/data/robot/expected.pipeline.json +++ b/tests/integration/data/robot/expected.pipeline.json @@ -3,7 +3,6 @@ "predictions": { "train": { "name": "prediction.train", - "path": "pipeline/predictions/prediction.train.parquet", "columns": { "f_x": { "name": "f_x", @@ -63,7 +62,6 @@ }, "validation": { "name": "prediction.validation", - "path": "pipeline/predictions/prediction.validation.parquet", "columns": { "f_x": { "name": "f_x", @@ -123,7 +121,6 @@ }, "test": { "name": "prediction.test", - "path": "pipeline/predictions/prediction.test.parquet", "columns": { "f_x": { "name": "f_x", @@ -185,7 +182,6 @@ "feature_sets": { "train": { "name": "features.train", - "path": "pipeline/feature_sets/features.train.parquet", "columns": { "f_x": { "name": "f_x", @@ -2459,7 +2455,6 @@ }, "validation": { "name": "features.validation", - "path": "pipeline/feature_sets/features.validation.parquet", "columns": { "f_x": { "name": "f_x", @@ -4733,7 +4728,6 @@ }, "test": { "name": "features.test", - "path": "pipeline/feature_sets/features.test.parquet", "columns": { "f_x": { "name": "f_x", @@ -11585,4 +11579,4 @@ "importance": 0.470964668931741 } ] -} \ No newline at end of file +} diff --git a/tests/integration/test_serialize_cora.py b/tests/integration/test_serialize_cora.py index 39cb89b..17f18bf 100644 --- a/tests/integration/test_serialize_cora.py +++ b/tests/integration/test_serialize_cora.py @@ -152,3 +152,15 @@ def _expected_pipeline_json( }, }, } + + +# TODO @urfoex: #45: enable this test once the issue is fixed +@pytest.mark.skip("Needs fix for getml: https://github.com/getml/monorepo/pull/418") +@pytest.mark.integration +def test_serialize_project_cora( + cora_project: CoraProject, + tmp_path: Path, +) -> None: + # Given + _ = cora_project + _ = tmp_path diff --git a/tests/integration/test_serialize_loans.py b/tests/integration/test_serialize_loans.py index 85e1d74..0efc7df 100644 --- a/tests/integration/test_serialize_loans.py +++ b/tests/integration/test_serialize_loans.py @@ -4,6 +4,8 @@ from typer.testing import CliRunner from getml_io.cli import app +from getml_io.getml.project_identification import ProjectIdentification +from getml_io.serialize.project import serialize_project from getml_io.utils.convert import assume_is_str from tests.integration.assertions import ( assert_container_information, @@ -97,3 +99,45 @@ def test_serialize_loans( "prediction.test.parquet", ], ) + + +@pytest.mark.integration +def test_serialize_project_loans( + loans_project: LoansProject, + tmp_path: Path, + data_path: Path, +) -> None: + # Given + root_storage_directory = tmp_path + + # When + project_information = serialize_project( + project_identification=ProjectIdentification( + project_name=loans_project.name, + pipeline_id=loans_project.pipeline.id, + container_id=assume_is_str(loans_project.container.id), + ), + root_storage_directory=None, + clear_storage_directory=False, + ) + + # Then + assert project_information + + # Then + subdirectories = list(root_storage_directory.iterdir()) + assert len(subdirectories) == 0 + + # Then - Container + expected_container_information = load_container_information( + data_path / "loans" / "expected.container.json", + ) + container_information = project_information.container_information + assert_container_information(container_information, expected_container_information) + + # Then - Pipeline + expected_pipeline_information = load_pipeline_information( + data_path / "loans" / "expected.pipeline.json", + ) + pipeline_information = project_information.pipeline_information + assert_pipeline_information(pipeline_information, expected_pipeline_information) diff --git a/tests/integration/test_serialize_numerical.py b/tests/integration/test_serialize_numerical.py index 15290b1..59bdbfe 100644 --- a/tests/integration/test_serialize_numerical.py +++ b/tests/integration/test_serialize_numerical.py @@ -4,6 +4,8 @@ from typer.testing import CliRunner from getml_io.cli import app +from getml_io.getml.project_identification import ProjectIdentification +from getml_io.serialize.project import serialize_project from getml_io.utils.convert import assume_is_str from tests.integration.assertions import ( assert_container_information, @@ -95,3 +97,45 @@ def test_serialize_numerical( "prediction.test.parquet", ], ) + + +@pytest.mark.integration +def test_serialize_project_numerical( + numerical_project: NumericalProject, + tmp_path: Path, + data_path: Path, +) -> None: + # Given + root_storage_directory = tmp_path + + # When + project_information = serialize_project( + project_identification=ProjectIdentification( + project_name=numerical_project.name, + pipeline_id=numerical_project.pipeline.id, + container_id=assume_is_str(numerical_project.container.id), + ), + root_storage_directory=None, + clear_storage_directory=False, + ) + + # Then + assert project_information + + # Then + subdirectories = list(root_storage_directory.iterdir()) + assert len(subdirectories) == 0 + + # Then - Container + expected_container_information = load_container_information( + data_path / "numerical" / "expected.container.json", + ) + container_information = project_information.container_information + assert_container_information(container_information, expected_container_information) + + # Then - Pipeline + expected_pipeline_information = load_pipeline_information( + data_path / "numerical" / "expected.pipeline.json", + ) + pipeline_information = project_information.pipeline_information + assert_pipeline_information(pipeline_information, expected_pipeline_information) diff --git a/tests/integration/test_serialize_robot.py b/tests/integration/test_serialize_robot.py index 9cfb766..0869720 100644 --- a/tests/integration/test_serialize_robot.py +++ b/tests/integration/test_serialize_robot.py @@ -4,6 +4,8 @@ from typer.testing import CliRunner from getml_io.cli import app +from getml_io.getml.project_identification import ProjectIdentification +from getml_io.serialize.project import serialize_project from getml_io.utils.convert import assume_is_str from tests.integration.assertions import ( assert_container_information, @@ -98,3 +100,45 @@ def test_serialize_robot( "prediction.validation.parquet", ], ) + + +@pytest.mark.integration +def test_serialize_project_robot( + robot_project: RobotProject, + tmp_path: Path, + data_path: Path, +) -> None: + # Given + root_storage_directory = tmp_path + + # When + project_information = serialize_project( + project_identification=ProjectIdentification( + project_name=robot_project.name, + pipeline_id=robot_project.pipeline.id, + container_id=assume_is_str(robot_project.container.id), + ), + root_storage_directory=None, + clear_storage_directory=False, + ) + + # Then + assert project_information + + # Then + subdirectories = list(root_storage_directory.iterdir()) + assert len(subdirectories) == 0 + + # Then - Container + expected_container_information = load_container_information( + data_path / "robot" / "expected.container.json", + ) + container_information = project_information.container_information + assert_container_information(container_information, expected_container_information) + + # Then - Pipeline + expected_pipeline_information = load_pipeline_information( + data_path / "robot" / "expected.pipeline.json", + ) + pipeline_information = project_information.pipeline_information + assert_pipeline_information(pipeline_information, expected_pipeline_information) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index d60e3cd..b919761 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -2,16 +2,16 @@ import re from collections.abc import ( Mapping, + MutableSequence, Sequence, ) from datetime import datetime, timezone from pathlib import Path -from typing import Any, Protocol +from typing import Protocol import getml.data.roles as getml_roles import numpy as np import pandas as pd -import pyarrow as pa import pytest import pytest_mock from duckdb import DuckDBPyConnection @@ -46,17 +46,19 @@ from getml_io.getml.predictors import LinearRegression from getml_io.getml.preprocessors import CategoryTrimmer from getml_io.getml.project import Project -from getml_io.getml.project_information import ProjectInformation +from getml_io.getml.project_identification import ProjectIdentification from getml_io.getml.relationships import Relationship from getml_io.getml.roles import Role, Roles from getml_io.getml.scores import ClassificationScore, Scores from getml_io.getml.tables import Table +from getml_io.metadata.column_statistics import ( + ColumnStatisticsNumerical, + ColumnType, +) from getml_io.metadata.container_information import ContainerInformation from getml_io.metadata.data_model_information import DataModelInformation from getml_io.metadata.dataframe_information import ( ColumnInformation, - ColumnStatisticsNumerical, - ColumnType, DataFrameInformation, ) from getml_io.metadata.pipeline_information import LossFunction, PipelineInformation @@ -148,8 +150,8 @@ def __getitem__(self, key: str) -> Subset: ... @pytest.fixture -def project_information() -> ProjectInformation: - return ProjectInformation( +def project_identification() -> ProjectIdentification: + return ProjectIdentification( project_name="test_project", pipeline_id="test_pipeline_id", container_id="test_container_id", @@ -211,11 +213,9 @@ def subsets_path(container_path: Path) -> Path: @pytest.fixture def dataframe_information_population( column_information_default: ColumnInformation, - population_path: Path, ) -> DataFrameInformation: return DataFrameInformation( name="population", - path=population_path / "population.parquet", columns={"default": column_information_default}, ) @@ -238,23 +238,19 @@ def feature_sets_path(pipeline_path: Path) -> Path: @pytest.fixture def dataframe_information_peripheral( column_information_default: ColumnInformation, - peripheral_path: Path, ) -> DataFrameInformation: return DataFrameInformation( name="peripheral", - path=peripheral_path / "peripheral.parquet", columns={"default": column_information_default}, ) @pytest.fixture def dataframe_information_subset( - tmp_path: Path, column_information_default: ColumnInformation, ) -> DataFrameInformation: return DataFrameInformation( name="subset", - path=tmp_path / "subset.parquet", columns={"default": column_information_default}, ) @@ -317,6 +313,8 @@ def mock_dataframe( ] dataframe.roles = mocker.MagicMock() dataframe.roles.column = _role_by_name + dataframe.to_arrow_stream = mocker.MagicMock() + dataframe.to_arrow_stream.__enter__.return_value = mocker.MagicMock() # pyright: ignore [reportAny] return dataframe @@ -330,11 +328,9 @@ def mock_dataframe_train(mock_dataframe: DataFrame) -> DataFrame: @pytest.fixture def dataframe_information_train( column_information_default: ColumnInformation, - subsets_path: Path, ) -> DataFrameInformation: return DataFrameInformation( name="dataframe_train", - path=subsets_path / "dataframe_train.parquet", columns={"default": column_information_default}, ) @@ -342,11 +338,9 @@ def dataframe_information_train( @pytest.fixture def dataframe_information_test( column_information_default: ColumnInformation, - subsets_path: Path, ) -> DataFrameInformation: return DataFrameInformation( name="dataframe_test", - path=subsets_path / "dataframe_test.parquet", columns={"default": column_information_default}, ) @@ -354,11 +348,9 @@ def dataframe_information_test( @pytest.fixture def dataframe_information_validation( column_information_default: ColumnInformation, - subsets_path: Path, ) -> DataFrameInformation: return DataFrameInformation( name="dataframe_validation", - path=subsets_path / "dataframe_validation.parquet", columns={"default": column_information_default}, ) @@ -379,12 +371,10 @@ def mock_dataframe_validation(mock_dataframe: DataFrame) -> DataFrame: @pytest.fixture def dataframe_information( - tmp_path: Path, column_information_default: ColumnInformation, ) -> DataFrameInformation: return DataFrameInformation( name="dataframe_name", - path=tmp_path / "dataframe_name.parquet", columns={"default": column_information_default}, ) @@ -625,11 +615,9 @@ def mock_project_empty( @pytest.fixture def dataframe_information_features_test( column_information_default: ColumnInformation, - feature_sets_path: Path, ) -> DataFrameInformation: return DataFrameInformation( name="features.test", - path=feature_sets_path / "features.test.parquet", columns={"default": column_information_default}, ) @@ -637,11 +625,9 @@ def dataframe_information_features_test( @pytest.fixture def dataframe_information_features_validation( column_information_default: ColumnInformation, - feature_sets_path: Path, ) -> DataFrameInformation: return DataFrameInformation( name="features.validation", - path=feature_sets_path / "features.validation.parquet", columns={"default": column_information_default}, ) @@ -791,19 +777,10 @@ def columns() -> list[Column]: def predictions( dataframe_information_test: DataFrameInformation, dataframe_information_validation: DataFrameInformation, - predictions_path: Path, ) -> dict[str, DataFrameInformation]: return { - "test": dataframe_information_test.model_copy( - update={ - "path": predictions_path / dataframe_information_test.path.name, - }, - ), - "validation": dataframe_information_validation.model_copy( - update={ - "path": predictions_path / dataframe_information_validation.path.name, - }, - ), + "test": dataframe_information_test, + "validation": dataframe_information_validation, } @@ -811,21 +788,10 @@ def predictions( def feature_sets( dataframe_information_features_test: DataFrameInformation, dataframe_information_features_validation: DataFrameInformation, - feature_sets_path: Path, ) -> dict[str, DataFrameInformation]: return { - "test": dataframe_information_features_test.model_copy( - update={ - "path": feature_sets_path - / dataframe_information_features_test.path.name, - }, - ), - "validation": dataframe_information_features_validation.model_copy( - update={ - "path": feature_sets_path - / dataframe_information_features_validation.path.name, - }, - ), + "test": dataframe_information_features_test, + "validation": dataframe_information_features_validation, } @@ -967,71 +933,48 @@ def _generate_raw_summary_statistics_pd(dataframe: DataFrame | View) -> pd.DataF return pd.DataFrame(data) -class MockDuckDBExecuteFactory(Protocol): - def __call__(self, dataframes_by_path: Mapping[Path, DataFrame | View]) -> None: ... +class MockDuckDBSQLFactory(Protocol): + def __call__(self, dataframes: MutableSequence[DataFrame | View]) -> None: ... REGEX_READ_PARQUET = re.compile(r"read_parquet\(\"(?P[^\"]+)\"\)") @pytest.fixture -def mock_duckdb_execute_factory( +def mock_duckdb_sql_factory( mocker: pytest_mock.MockerFixture, - tmp_path: Path, -) -> MockDuckDBExecuteFactory: - def mock_duckdb_execute( - dataframes_by_path: Mapping[Path, DataFrame | View], +) -> MockDuckDBSQLFactory: + def mock_duckdb_sql( + dataframes: MutableSequence[DataFrame | View], ) -> None: - connection_context_manager = mocker.MagicMock(DuckDBPyConnection) - connection = mocker.MagicMock(DuckDBPyConnection) - connection_context_manager.__enter__.return_value = connection # pyright: ignore [reportAny] + mock_connection = mocker.MagicMock(DuckDBPyConnection) _ = mocker.patch( - "getml_io.serialize.parquet.duckdb.connect", - return_value=connection_context_manager, + "getml_io.utils.duckdb.duckdb.register", + return_value=mock_connection, ) - def mocked_execute( + def mocked_sql( _query: str, - parameters: Sequence[str], + params: Sequence[str] | None = None, ) -> DuckDBPyConnection: - assert len(parameters) == 1 - current_parquet_path = Path(parameters[0]) - mock_execution = mocker.MagicMock(DuckDBPyConnection) + _ = params + mock_sql = mocker.MagicMock(DuckDBPyConnection) def mocked_df() -> pd.DataFrame: - current_dataframe = dataframes_by_path[ - current_parquet_path.relative_to(tmp_path) - ] + current_dataframe = dataframes.pop(0) return _generate_raw_summary_statistics_pd(current_dataframe) - mock_execution.df.side_effect = mocked_df # pyright: ignore [reportAny] - return mock_execution - - connection.execute.side_effect = mocked_execute # pyright: ignore [reportAny] + mock_sql.df.side_effect = mocked_df # pyright: ignore [reportAny] + return mock_sql - return mock_duckdb_execute + mock_connection.sql.side_effect = mocked_sql # pyright: ignore [reportAny] + _ = mocker.patch( + "getml_io.utils.duckdb.duckdb.sql", + side_effect=mocked_sql, + ) -@pytest.fixture -def mock_getml_dataframe_from_array( - mocker: pytest_mock.MockerFixture, - mock_dataframe: DataFrame, -) -> None: - def mock_from_array( - table: pa.Table, # pyright: ignore [reportUnknownMemberType, reportUnknownParameterType] - name: str, - *_args: tuple[Any, ...], # pyright: ignore [reportExplicitAny] - **_kwargs: dict[str, Any], # pyright: ignore [reportExplicitAny] - ) -> DataFrame: - _ = table # pyright: ignore [reportUnknownVariableType] - dataframe = copy.deepcopy(mock_dataframe) - dataframe.name = name - return dataframe - - _ = mocker.patch( - "getml_io.serialize.pipeline.DataFrame.from_arrow", - side_effect=mock_from_array, - ) + return mock_duckdb_sql def build_statistics_varchar(column_type: str) -> StatisticsType: diff --git a/tests/unit/metadata/test_container_information.py b/tests/unit/metadata/test_container_information.py index a80657e..4c0d079 100644 --- a/tests/unit/metadata/test_container_information.py +++ b/tests/unit/metadata/test_container_information.py @@ -1,5 +1,3 @@ -from pathlib import Path - import pytest from getml_io.getml.roles import Role @@ -33,29 +31,18 @@ def _get_expected_empty_container_information() -> ContainerInformationType: @pytest.mark.unit def test_serialize_model( container_information: ContainerInformation, - population_path: Path, - peripheral_path: Path, - subsets_path: Path, ) -> None: # When serialized_model = ContainerInformation.model_dump(container_information) # Then expected_serialized_container_information = ( - _get_expected_serialized_container_information( - population_path, - peripheral_path, - subsets_path, - ) + _get_expected_serialized_container_information() ) assert serialized_model == expected_serialized_container_information -def _get_expected_serialized_container_information( - population_path: Path, - peripheral_path: Path, - subsets_path: Path, -) -> ContainerInformationType: +def _get_expected_serialized_container_information() -> ContainerInformationType: expected_column_information_by_name: ColumnInformationType = { "default": { "name": "default", @@ -80,30 +67,25 @@ def _get_expected_serialized_container_information( "id": "container_id", "population": { "name": "population", - "path": population_path / "population.parquet", "columns": expected_column_information_by_name, }, "peripheral": { "peripheral": { "name": "peripheral", - "path": peripheral_path / "peripheral.parquet", "columns": expected_column_information_by_name, }, }, "subsets": { "train": { "name": "dataframe_train", - "path": subsets_path / "dataframe_train.parquet", "columns": expected_column_information_by_name, }, "test": { "name": "dataframe_test", - "path": subsets_path / "dataframe_test.parquet", "columns": expected_column_information_by_name, }, "validation": { "name": "dataframe_validation", - "path": subsets_path / "dataframe_validation.parquet", "columns": expected_column_information_by_name, }, }, diff --git a/tests/unit/metadata/test_pipeline_information.py b/tests/unit/metadata/test_pipeline_information.py index 0fbb0ce..c17a7c8 100644 --- a/tests/unit/metadata/test_pipeline_information.py +++ b/tests/unit/metadata/test_pipeline_information.py @@ -1,5 +1,4 @@ from datetime import datetime, timezone -from pathlib import Path import pytest @@ -112,24 +111,20 @@ def _get_expected_serialized_pipeline_information() -> PipelineInformationType: "predictions": { "test": { "name": "dataframe_test", - "path": Path("pipeline/predictions/dataframe_test.parquet"), "columns": expected_column_information_by_name, }, "validation": { "name": "dataframe_validation", - "path": Path("pipeline/predictions/dataframe_validation.parquet"), "columns": expected_column_information_by_name, }, }, "feature_sets": { "test": { "name": "features.test", - "path": Path("pipeline/feature_sets/features.test.parquet"), "columns": expected_column_information_by_name, }, "validation": { "name": "features.validation", - "path": Path("pipeline/feature_sets/features.validation.parquet"), "columns": expected_column_information_by_name, }, }, diff --git a/tests/unit/serialize/test_column_information.py b/tests/unit/serialize/test_column_information.py new file mode 100644 index 0000000..5820aea --- /dev/null +++ b/tests/unit/serialize/test_column_information.py @@ -0,0 +1,217 @@ +from pathlib import Path + +import getml.data.roles as getml_roles +import pyarrow as pa +import pytest +from getml.data import DataFrame +from getml.data.roles.types import Role as GetMLRole +from pyarrow import parquet as pq + +from getml_io.getml.roles import Role +from getml_io.metadata.column_statistics import ( + ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING, + ColumnStatistics, + ColumnStatisticsCategorical, + ColumnStatisticsJoinKey, + ColumnStatisticsNumerical, + ColumnStatisticsTarget, + ColumnStatisticsText, + ColumnStatisticsTimeStamp, + ColumnStatisticsTimeStampAsFloat, + ColumnStatisticsUnusedFloat, + ColumnStatisticsUnusedString, + ColumnType, +) +from getml_io.serialize.column_information import ( + _get_column_statistics_type, # pyright: ignore[reportPrivateUsage] + build_column_information_by_name_for_dataframe, + build_column_information_by_name_for_parquet, + build_column_information_by_name_for_table, # pyright: ignore[reportUnknownVariableType] +) +from getml_io.serialize.exception import ( + UnsupportedColumnStatisticsError, +) +from getml_io.serialize.roles import convert_role +from getml_io.utils.convert import assume_is_str +from tests.unit.conftest import ( + MockDuckDBSQLFactory, + column_information_by_name_to_json, + get_expected_column_information_by_name, +) + + +@pytest.mark.unit +@pytest.mark.parametrize( + ("getml_role", "column_type"), + [ + (getml_roles.categorical, ColumnType.DOUBLE), + (getml_roles.categorical, ColumnType.TIMESTAMP_NS), + (getml_roles.join_key, ColumnType.DOUBLE), + (getml_roles.join_key, ColumnType.TIMESTAMP_NS), + (getml_roles.text, ColumnType.DOUBLE), + (getml_roles.text, ColumnType.TIMESTAMP_NS), + (getml_roles.unused_string, ColumnType.DOUBLE), + (getml_roles.unused_string, ColumnType.TIMESTAMP_NS), + (getml_roles.numerical, ColumnType.VARCHAR), + (getml_roles.numerical, ColumnType.TIMESTAMP_NS), + (getml_roles.target, ColumnType.VARCHAR), + (getml_roles.target, ColumnType.TIMESTAMP_NS), + (getml_roles.time_stamp, ColumnType.VARCHAR), + (getml_roles.unused_float, ColumnType.VARCHAR), + (getml_roles.unused_float, ColumnType.TIMESTAMP_NS), + ], +) +def test__get_column_statistics_type_error( + getml_role: GetMLRole, + column_type: ColumnType, +) -> None: + # Given + column_name = "TestColumn" + dataframe_name = "mock_dataframe_name" + role = convert_role(getml_role) + + # When / Then + with pytest.raises( + UnsupportedColumnStatisticsError, + match=( + rf"Column '{column_name}' in dataframe '{dataframe_name}' " + rf"has an unsupported role: {role!r} and type: {column_type.value}" + ), + ): + _ = _get_column_statistics_type( + dataframe_name, + column_name, + getml_role, + column_type.value, + ) + + assert (role, column_type) not in ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING + + +@pytest.mark.unit +@pytest.mark.parametrize( + ("getml_role", "column_type", "expected_column_statistics_type"), + [ + (Role.CATEGORICAL, ColumnType.VARCHAR, ColumnStatisticsCategorical), + (Role.JOIN_KEY, ColumnType.VARCHAR, ColumnStatisticsJoinKey), + (Role.NUMERICAL, ColumnType.DOUBLE, ColumnStatisticsNumerical), + (Role.TARGET, ColumnType.DOUBLE, ColumnStatisticsTarget), + (Role.TIME_STAMP, ColumnType.TIMESTAMP_NS, ColumnStatisticsTimeStamp), + (Role.TIME_STAMP, ColumnType.DOUBLE, ColumnStatisticsTimeStampAsFloat), + (Role.TEXT, ColumnType.VARCHAR, ColumnStatisticsText), + (Role.UNUSED_FLOAT, ColumnType.DOUBLE, ColumnStatisticsUnusedFloat), + (Role.UNUSED_STRING, ColumnType.VARCHAR, ColumnStatisticsUnusedString), + ], +) +def test__get_column_statistics_type( + getml_role: GetMLRole, + column_type: ColumnType, + expected_column_statistics_type: type[ColumnStatistics], +) -> None: + # Given + column_name = "TestColumn" + dataframe_name = "mock_dataframe_name" + role = convert_role(getml_role) + + # When + column_statistics_type = _get_column_statistics_type( + dataframe_name, + column_name, + getml_role, + column_type.value, + ) + + # Then + assert column_statistics_type is expected_column_statistics_type + assert (role, column_type) in ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING + + +@pytest.mark.unit +def test_build_column_information_by_name_for_dataframe( + mock_dataframe: DataFrame, + mock_duckdb_sql_factory: MockDuckDBSQLFactory, +) -> None: + # Given + mock_duckdb_sql_factory( + [ + mock_dataframe, + ], + ) + + # When + column_information_by_name = build_column_information_by_name_for_dataframe( + mock_dataframe, + ) + + # Then + expected_column_information_by_name = get_expected_column_information_by_name() + assert column_information_by_name_to_json( + column_information_by_name, + ) == column_information_by_name_to_json(expected_column_information_by_name) + + +@pytest.mark.unit +def test_build_column_information_by_name_for_parquet( + tmp_path: Path, + mock_dataframe: DataFrame, + mock_duckdb_sql_factory: MockDuckDBSQLFactory, +) -> None: + # Given + mock_duckdb_sql_factory( + [ + mock_dataframe, + ], + ) + target_filepath = tmp_path / "mock_dataframe_name.parquet" + table: pa.Table = pa.Table.from_pydict( # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + { + "mocked": [], + }, + ) + pq.write_table(table, target_filepath) # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType] + + # When + column_information_by_name = build_column_information_by_name_for_parquet( + parquet_filepath=target_filepath, + dataframe_name=assume_is_str(mock_dataframe.name), + get_getml_role_by_column=mock_dataframe.roles.column, + column_names=mock_dataframe.columns, + ) + + # Then + expected_column_information_by_name = get_expected_column_information_by_name() + assert column_information_by_name_to_json( + column_information_by_name, + ) == column_information_by_name_to_json(expected_column_information_by_name) + + +@pytest.mark.unit +def test_build_column_information_by_name_for_table( + mock_dataframe: DataFrame, + mock_duckdb_sql_factory: MockDuckDBSQLFactory, +) -> None: + # Given + mock_duckdb_sql_factory( + [ + mock_dataframe, + ], + ) + table: pa.Table = pa.Table.from_pydict( # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + { + "mocked": [], + }, + ) + + # When + column_information_by_name = build_column_information_by_name_for_table( + table, # pyright: ignore[reportUnknownArgumentType] + dataframe_name=assume_is_str(mock_dataframe.name), + get_getml_role_by_column=mock_dataframe.roles.column, + column_names=mock_dataframe.columns, + ) + + # Then + expected_column_information_by_name = get_expected_column_information_by_name() + assert column_information_by_name_to_json( + column_information_by_name, + ) == column_information_by_name_to_json(expected_column_information_by_name) diff --git a/tests/unit/serialize/test_container.py b/tests/unit/serialize/test_container.py index 2a91118..58b30e7 100644 --- a/tests/unit/serialize/test_container.py +++ b/tests/unit/serialize/test_container.py @@ -1,8 +1,9 @@ import json +from collections.abc import MutableSequence from pathlib import Path import pytest -from getml.data import Container +from getml.data import Container, DataFrame, View from getml_io.serialize.container import serialize_container from getml_io.utils.convert import ( @@ -10,7 +11,7 @@ assume_is_optional_dataframe_or_view, ) from tests.unit.conftest import ( - MockDuckDBExecuteFactory, + MockDuckDBSQLFactory, build_column_information_by_name, ) from tests.unit.types import ( @@ -20,43 +21,29 @@ @pytest.mark.unit -def test_serialize_container( # noqa: PLR0913 +def test_serialize_container( mock_container: Container, tmp_path: Path, - mock_duckdb_execute_factory: MockDuckDBExecuteFactory, - population_path: Path, - peripheral_path: Path, - subsets_path: Path, + mock_duckdb_sql_factory: MockDuckDBSQLFactory, ) -> None: # Given target_storage_directory = tmp_path population = assume_is_optional_dataframe_or_view(mock_container.population) - mock_duckdb_execute_factory( - { - **( - {population_path / f"{population.name}.parquet": population} - if population is not None - else {} - ), - **{ - peripheral_path / f"{peripheral.name}.parquet": peripheral - for peripheral in assume_is_dict_str_to_dataframe_or_view( - mock_container.peripheral, - ).values() - }, - **{ - subsets_path / f"{subset_name}.{subset.name}.parquet": subset - for subset_name, subset in assume_is_dict_str_to_dataframe_or_view( - mock_container.subsets, - ).items() - }, - }, + duckdb_sql_entries: MutableSequence[DataFrame | View] = [] + if population is not None: + duckdb_sql_entries.append(population) + duckdb_sql_entries.extend( + assume_is_dict_str_to_dataframe_or_view(mock_container.peripheral).values(), ) + duckdb_sql_entries.extend( + assume_is_dict_str_to_dataframe_or_view(mock_container.subsets).values(), + ) + mock_duckdb_sql_factory(duckdb_sql_entries) # When - container_information_json_path, container_information = serialize_container( - mock_container, - target_storage_directory, + container_information = serialize_container( + container=mock_container, + target_storage_directory=target_storage_directory, ) # Then @@ -64,58 +51,36 @@ def test_serialize_container( # noqa: PLR0913 target_storage_directory / "container.json" ) assert expected_container_information_json_path.exists() - assert container_information_json_path == expected_container_information_json_path assert container_information.id == "mock_container_id" assert container_information.population is not None assert container_information.population.name == "mock_population_name" - assert container_information.population.path == ( - population_path / "mock_population_name.parquet" - ) assert len(container_information.peripheral) == 1 peripheral_information = container_information.peripheral["mock_peripheral_name"] assert peripheral_information.name == "mock_peripheral_name" - assert peripheral_information.path == ( - peripheral_path / "mock_peripheral_name.parquet" - ) expected_number_of_subsets = 3 assert len(container_information.subsets) == expected_number_of_subsets train_information = container_information.subsets["train"] assert train_information.name == "mock_dataframe_train" - assert train_information.path == ( - subsets_path / "train.mock_dataframe_train.parquet" - ) test_information = container_information.subsets["test"] assert test_information.name == "mock_dataframe_test" - assert test_information.path == subsets_path / "test.mock_dataframe_test.parquet" validation_information = container_information.subsets["validation"] assert validation_information.name == "mock_dataframe_validation" - assert validation_information.path == ( - subsets_path / "validation.mock_dataframe_validation.parquet" - ) assert not container_information.deep_copy - expected_container_json_content = _get_expected_container_information( - population_path, - peripheral_path, - subsets_path, - ) + expected_container_json_content = _get_expected_container_information() container_json = expected_container_information_json_path.read_text() assert json.loads(container_json) == expected_container_json_content -def _get_expected_container_information( - population_path: Path, - peripheral_path: Path, - subsets_path: Path, -) -> ContainerInformationType: +def _get_expected_container_information() -> ContainerInformationType: expected_column_information_by_name: ColumnInformationType = ( build_column_information_by_name() ) @@ -123,32 +88,25 @@ def _get_expected_container_information( "id": "mock_container_id", "population": { "name": "mock_population_name", - "path": str(population_path / "mock_population_name.parquet"), "columns": expected_column_information_by_name, }, "peripheral": { "mock_peripheral_name": { "name": "mock_peripheral_name", - "path": str(peripheral_path / "mock_peripheral_name.parquet"), "columns": expected_column_information_by_name, }, }, "subsets": { "test": { "name": "mock_dataframe_test", - "path": str(subsets_path / "test.mock_dataframe_test.parquet"), "columns": expected_column_information_by_name, }, "train": { "name": "mock_dataframe_train", - "path": str(subsets_path / "train.mock_dataframe_train.parquet"), "columns": expected_column_information_by_name, }, "validation": { "name": "mock_dataframe_validation", - "path": str( - subsets_path / "validation.mock_dataframe_validation.parquet", - ), "columns": expected_column_information_by_name, }, }, @@ -165,9 +123,9 @@ def test_serialize_container_empty( target_storage_directory = tmp_path # When - container_information_json_path, container_information = serialize_container( - mock_container_empty, - target_storage_directory, + container_information = serialize_container( + container=mock_container_empty, + target_storage_directory=target_storage_directory, ) # Then @@ -175,7 +133,6 @@ def test_serialize_container_empty( target_storage_directory / "container.json" ) assert expected_container_information_json_path.exists() - assert container_information_json_path == expected_container_information_json_path assert container_information.id == "mock_container_empty_id" assert container_information.population is None diff --git a/tests/unit/serialize/test_dataframe_information.py b/tests/unit/serialize/test_dataframe_information.py deleted file mode 100644 index 7143f83..0000000 --- a/tests/unit/serialize/test_dataframe_information.py +++ /dev/null @@ -1,71 +0,0 @@ -from pathlib import Path - -import pytest - -from getml_io.metadata.dataframe_information import DataFrameInformation -from getml_io.metadata.exception import ( - DataFrameInformationPathNotRelativeError, -) -from getml_io.serialize.dataframe_information import ( - derive_instance_with_relative_path, - derive_instances_with_relative_path, -) - - -@pytest.mark.unit -def test_derive_instance_with_relative_path( - tmp_path: Path, - dataframe_information: DataFrameInformation, -) -> None: - # Given - - # When - derived_instance = derive_instance_with_relative_path( - dataframe_information, - tmp_path, - ) - - # Then - assert isinstance(derived_instance, DataFrameInformation) - assert derived_instance.path == Path(dataframe_information.path.name) - - -@pytest.mark.unit -def test_derive_instance_with_relative_path_not_relative( - dataframe_information: DataFrameInformation, -) -> None: - # Given - non_relative_path = Path("/non/relative/path") - - # When / Then - with pytest.raises( - DataFrameInformationPathNotRelativeError, - match=( - r"'.*Information' with name '.*_name' " - f"and path '{dataframe_information.path}' " - r"is not relative to base path '/non/relative/path'." - ), - ): - _ = derive_instance_with_relative_path( - dataframe_information, - non_relative_path, - ) - - -@pytest.mark.unit -def test_derive_instances_with_relative_path( - tmp_path: Path, - dataframe_information: DataFrameInformation, -) -> None: - # When - derived_instances_by_name = derive_instances_with_relative_path( - {dataframe_information.name: dataframe_information}, - tmp_path, - ) - - # Then - assert list(derived_instances_by_name.keys()) == [dataframe_information.name] - - derived_instance = derived_instances_by_name[dataframe_information.name] - assert isinstance(derived_instance, DataFrameInformation) - assert derived_instance.path == Path(dataframe_information.path.name) diff --git a/tests/unit/serialize/test_dataframe_or_view.py b/tests/unit/serialize/test_dataframe_or_view.py index 92a01c0..57f398c 100644 --- a/tests/unit/serialize/test_dataframe_or_view.py +++ b/tests/unit/serialize/test_dataframe_or_view.py @@ -12,7 +12,7 @@ ) from getml_io.utils.exception import StorageDirectoryCreationError from tests.unit.conftest import ( - MockDuckDBExecuteFactory, + MockDuckDBSQLFactory, column_information_by_name_to_json, get_expected_column_information_by_name, ) @@ -23,33 +23,32 @@ def test_serialize_dataframe_or_view( tmp_path: Path, mocker: pytest_mock.MockerFixture, mock_dataframe: DataFrame, - mock_duckdb_execute_factory: MockDuckDBExecuteFactory, + mock_duckdb_sql_factory: MockDuckDBSQLFactory, ) -> None: # Given - mock_duckdb_execute_factory( - { - Path(f"dataframes/{mock_dataframe.name}.parquet"): mock_dataframe, - }, + mock_duckdb_sql_factory( + [ + mock_dataframe, + ], ) target_storage_directory = tmp_path / "dataframes" mock_dataframe.to_parquet = mocker.Mock() # When - serialized_info = serialize_dataframe_or_view( - mock_dataframe, - target_storage_directory, + serialized_information = serialize_dataframe_or_view( + dataframe_or_view=mock_dataframe, + target_storage_directory=target_storage_directory, ) # Then expected_parquet_path = target_storage_directory / "mock_dataframe_name.parquet" - assert serialized_info.name == "mock_dataframe_name" - assert serialized_info.path == expected_parquet_path + assert serialized_information.name == mock_dataframe.name mock_dataframe.to_parquet.assert_called_once_with(str(expected_parquet_path)) expected_column_information_by_name = get_expected_column_information_by_name() assert column_information_by_name_to_json( - serialized_info.columns, + serialized_information.columns, ) == column_information_by_name_to_json(expected_column_information_by_name) @@ -66,8 +65,8 @@ def test_serialize_dataframe_or_view_directory_creation_error( match=r"Failed to create storage directory '/invalid/dataframes'.", ): _ = serialize_dataframe_or_view( - mock_dataframe, - invalid_target_storage_directory, + dataframe_or_view=mock_dataframe, + target_storage_directory=invalid_target_storage_directory, ) @@ -84,6 +83,39 @@ def test_serialize_dataframe_or_view_storage_error( # When / Then with pytest.raises( DataFrameParquetStorageError, - match=r"Failed to store DataFrame as parquet 'mock_dataframe_name' at path", + match=rf"Failed to store DataFrame as parquet '{mock_dataframe.name}' at path", ): - _ = serialize_dataframe_or_view(mock_dataframe, target_storage_directory) + _ = serialize_dataframe_or_view( + dataframe_or_view=mock_dataframe, + target_storage_directory=target_storage_directory, + ) + + +@pytest.mark.unit +def test_serialize_dataframe_or_view_without_storage_directory( + mocker: pytest_mock.MockerFixture, + mock_dataframe: DataFrame, + mock_duckdb_sql_factory: MockDuckDBSQLFactory, +) -> None: + # Given + mock_duckdb_sql_factory( + [ + mock_dataframe, + ], + ) + + mock_dataframe.to_parquet = mocker.Mock() + + # When + serialized_information = serialize_dataframe_or_view( + dataframe_or_view=mock_dataframe, + ) + + # Then + assert serialized_information.name == mock_dataframe.name + mock_dataframe.to_parquet.assert_not_called() + + expected_column_information_by_name = get_expected_column_information_by_name() + assert column_information_by_name_to_json( + serialized_information.columns, + ) == column_information_by_name_to_json(expected_column_information_by_name) diff --git a/tests/unit/serialize/test_parquet.py b/tests/unit/serialize/test_parquet.py index 7130169..806b1bf 100644 --- a/tests/unit/serialize/test_parquet.py +++ b/tests/unit/serialize/test_parquet.py @@ -1,155 +1,45 @@ from pathlib import Path -import getml.data.roles as getml_roles import pytest from getml.data import DataFrame -from getml.data.roles.types import Role as GetMLRole -from getml_io.getml.roles import Role -from getml_io.metadata.dataframe_information import ( - ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING, - ColumnStatistics, - ColumnStatisticsCategorical, - ColumnStatisticsJoinKey, - ColumnStatisticsNumerical, - ColumnStatisticsTarget, - ColumnStatisticsText, - ColumnStatisticsTimeStamp, - ColumnStatisticsTimeStampAsFloat, - ColumnStatisticsUnusedFloat, - ColumnStatisticsUnusedString, - ColumnType, -) -from getml_io.serialize.exception import ( - UnsupportedColumnStatisticsError, -) from getml_io.serialize.parquet import ( - _get_column_statistics_type, # pyright: ignore[reportPrivateUsage] - serialize_dataframe, + serialize_object, ) -from getml_io.serialize.roles import serialize_role from getml_io.utils.convert import assume_is_str from tests.unit.conftest import ( - MockDuckDBExecuteFactory, + MockDuckDBSQLFactory, column_information_by_name_to_json, get_expected_column_information_by_name, ) @pytest.mark.unit -@pytest.mark.parametrize( - ("getml_role", "column_type"), - [ - (getml_roles.categorical, ColumnType.DOUBLE), - (getml_roles.categorical, ColumnType.TIMESTAMP_NS), - (getml_roles.join_key, ColumnType.DOUBLE), - (getml_roles.join_key, ColumnType.TIMESTAMP_NS), - (getml_roles.text, ColumnType.DOUBLE), - (getml_roles.text, ColumnType.TIMESTAMP_NS), - (getml_roles.unused_string, ColumnType.DOUBLE), - (getml_roles.unused_string, ColumnType.TIMESTAMP_NS), - (getml_roles.numerical, ColumnType.VARCHAR), - (getml_roles.numerical, ColumnType.TIMESTAMP_NS), - (getml_roles.target, ColumnType.VARCHAR), - (getml_roles.target, ColumnType.TIMESTAMP_NS), - (getml_roles.time_stamp, ColumnType.VARCHAR), - (getml_roles.unused_float, ColumnType.VARCHAR), - (getml_roles.unused_float, ColumnType.TIMESTAMP_NS), - ], -) -def test__get_column_statistics_type_error( - getml_role: GetMLRole, - column_type: ColumnType, -) -> None: - # Given - column_name = "TestColumn" - dataframe_name = "mock_dataframe_name" - role = serialize_role(getml_role) - - # When / Then - with pytest.raises( - UnsupportedColumnStatisticsError, - match=( - rf"Column '{column_name}' in dataframe '{dataframe_name}' " - rf"has an unsupported role: {role!r} and type: {column_type.value}" - ), - ): - _ = _get_column_statistics_type( - dataframe_name, - column_name, - getml_role, - column_type.value, - ) - - assert (role, column_type) not in ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING - - -@pytest.mark.unit -@pytest.mark.parametrize( - ("getml_role", "column_type", "expected_column_statistics_type"), - [ - (Role.CATEGORICAL, ColumnType.VARCHAR, ColumnStatisticsCategorical), - (Role.JOIN_KEY, ColumnType.VARCHAR, ColumnStatisticsJoinKey), - (Role.NUMERICAL, ColumnType.DOUBLE, ColumnStatisticsNumerical), - (Role.TARGET, ColumnType.DOUBLE, ColumnStatisticsTarget), - (Role.TIME_STAMP, ColumnType.TIMESTAMP_NS, ColumnStatisticsTimeStamp), - (Role.TIME_STAMP, ColumnType.DOUBLE, ColumnStatisticsTimeStampAsFloat), - (Role.TEXT, ColumnType.VARCHAR, ColumnStatisticsText), - (Role.UNUSED_FLOAT, ColumnType.DOUBLE, ColumnStatisticsUnusedFloat), - (Role.UNUSED_STRING, ColumnType.VARCHAR, ColumnStatisticsUnusedString), - ], -) -def test__get_column_statistics_type( - getml_role: GetMLRole, - column_type: ColumnType, - expected_column_statistics_type: type[ColumnStatistics], -) -> None: - # Given - column_name = "TestColumn" - dataframe_name = "mock_dataframe_name" - role = serialize_role(getml_role) - - # When - column_statistics_type = _get_column_statistics_type( - dataframe_name, - column_name, - getml_role, - column_type.value, - ) - - # Then - assert column_statistics_type is expected_column_statistics_type - assert (role, column_type) in ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING - - -@pytest.mark.unit -def test_serialize_dataframe( +def test_serialize_object( tmp_path: Path, mock_dataframe: DataFrame, - mock_duckdb_execute_factory: MockDuckDBExecuteFactory, + mock_duckdb_sql_factory: MockDuckDBSQLFactory, ) -> None: # Given - mock_duckdb_execute_factory( - { - Path(f"dataframes/{mock_dataframe.name}.parquet"): mock_dataframe, - }, + mock_duckdb_sql_factory( + [ + mock_dataframe, + ], ) target_storage_directory = tmp_path / "dataframes" # When - serialized_info = serialize_dataframe( + serialized_info = serialize_object( target_storage_directory=target_storage_directory, save_parquet=(lambda _: None), - dataframe_name=assume_is_str(mock_dataframe.name), + object_name=assume_is_str(mock_dataframe.name), get_getml_role_by_column=mock_dataframe.roles.column, column_names=mock_dataframe.columns, ) # Then - expected_parquet_path = target_storage_directory / "mock_dataframe_name.parquet" - assert serialized_info.name == "mock_dataframe_name" - assert serialized_info.path == expected_parquet_path + assert serialized_info.name == mock_dataframe.name expected_column_information_by_name = get_expected_column_information_by_name() assert column_information_by_name_to_json( diff --git a/tests/unit/serialize/test_pipeline.py b/tests/unit/serialize/test_pipeline.py index f4c0750..1249378 100644 --- a/tests/unit/serialize/test_pipeline.py +++ b/tests/unit/serialize/test_pipeline.py @@ -49,67 +49,60 @@ from getml_io.metadata.pipeline_information import LossFunction from getml_io.serialize.exception import WrongPipelineScoreTypeError from getml_io.serialize.pipeline import ( - serialize_all_metadata, - serialize_columns, - serialize_feature_learner, + convert_all_metadata, + convert_columns, + convert_feature_learner, + convert_features, + convert_predictor, + convert_preprocessor, + convert_scores, + convert_tables, serialize_feature_sets, - serialize_features, serialize_pipeline, serialize_predictions, - serialize_predictor, - serialize_preprocessor, - serialize_scores, - serialize_tables, ) -from tests.unit.conftest import MockDuckDBExecuteFactory +from tests.unit.conftest import MockDuckDBSQLFactory @pytest.mark.unit -def test_serialize_pipeline( # noqa: PLR0913 +def test_serialize_pipeline( tmp_path: Path, mock_pipeline: Pipeline, mock_container: Container, - mock_duckdb_execute_factory: MockDuckDBExecuteFactory, + mock_duckdb_sql_factory: MockDuckDBSQLFactory, mock_dataframe: DataFrame, - mock_getml_dataframe_from_array: None, - feature_sets_path: Path, - predictions_path: Path, ) -> None: # Given - _ = mock_getml_dataframe_from_array - mock_duckdb_execute_factory( - { - feature_sets_path / "features.train.parquet": mock_dataframe, - feature_sets_path / "features.test.parquet": mock_dataframe, - feature_sets_path / "features.validation.parquet": mock_dataframe, - predictions_path / "prediction.train.parquet": mock_dataframe, - predictions_path / "prediction.test.parquet": mock_dataframe, - predictions_path / "prediction.validation.parquet": mock_dataframe, - }, + mock_duckdb_sql_factory( + [ + mock_dataframe, + mock_dataframe, + mock_dataframe, + mock_dataframe, + mock_dataframe, + mock_dataframe, + ], ) target_storage_directory = tmp_path # When - pipeline_information_json_path, pipeline_information = serialize_pipeline( - mock_pipeline, - mock_container, - target_storage_directory, + pipeline_information = serialize_pipeline( + pipeline=mock_pipeline, + container=mock_container, + target_storage_directory=target_storage_directory, ) # Then assert pipeline_information.id == mock_pipeline.id expected_pipeline_information_json_path = target_storage_directory / "pipeline.json" - assert pipeline_information_json_path == expected_pipeline_information_json_path - - pipeline_path = Path("pipeline") + assert expected_pipeline_information_json_path.exists() # Then - feature sets for subset in ["train", "test", "validation"]: _assert_features_valid( pipeline_information.feature_sets, subset, - pipeline_path / "feature_sets", ) # Then - predictions @@ -117,7 +110,6 @@ def test_serialize_pipeline( # noqa: PLR0913 _assert_predictions_valid( pipeline_information.predictions, subset, - pipeline_path / "predictions", ) assert len(pipeline_information.feature_learners) == 1 @@ -165,17 +157,17 @@ def test_serialize_pipeline_with_empty_outputs( target_storage_directory = tmp_path # When - pipeline_information_json_path, pipeline_information = serialize_pipeline( - mock_pipeline, - mock_container_empty, - target_storage_directory, + pipeline_information = serialize_pipeline( + pipeline=mock_pipeline, + container=mock_container_empty, + target_storage_directory=target_storage_directory, ) # Then assert pipeline_information.id == mock_pipeline.id expected_pipeline_information_json_path = target_storage_directory / "pipeline.json" - assert pipeline_information_json_path == expected_pipeline_information_json_path + assert expected_pipeline_information_json_path.exists() # Then - feature sets assert len(pipeline_information.feature_sets) == 0 @@ -185,29 +177,28 @@ def test_serialize_pipeline_with_empty_outputs( @pytest.mark.unit -def test_serialize_feature_sets( # noqa: PLR0913 +def test_serialize_feature_sets( tmp_path: Path, mock_pipeline: Pipeline, mock_container: Container, - mock_duckdb_execute_factory: MockDuckDBExecuteFactory, + mock_duckdb_sql_factory: MockDuckDBSQLFactory, mock_dataframe: DataFrame, - feature_sets_path: Path, ) -> None: # Given - mock_duckdb_execute_factory( - { - feature_sets_path / "features.train.parquet": mock_dataframe, - feature_sets_path / "features.test.parquet": mock_dataframe, - feature_sets_path / "features.validation.parquet": mock_dataframe, - }, + mock_duckdb_sql_factory( + [ + mock_dataframe, + mock_dataframe, + mock_dataframe, + ], ) target_storage_directory = tmp_path / "pipeline" # When feature_sets = serialize_feature_sets( - mock_pipeline, - mock_container, - target_storage_directory, + pipeline=mock_pipeline, + container=mock_container, + target_storage_directory=target_storage_directory, ) # Then @@ -218,36 +209,32 @@ def test_serialize_feature_sets( # noqa: PLR0913 _assert_features_valid( feature_sets, subset, - target_storage_directory / "feature_sets", ) @pytest.mark.unit -def test_serialize_predictions( # noqa: PLR0913 +def test_serialize_predictions( tmp_path: Path, mock_pipeline: Pipeline, mock_container: Container, - mock_duckdb_execute_factory: MockDuckDBExecuteFactory, + mock_duckdb_sql_factory: MockDuckDBSQLFactory, mock_dataframe: DataFrame, - mock_getml_dataframe_from_array: None, - predictions_path: Path, ) -> None: # Given - _ = mock_getml_dataframe_from_array - mock_duckdb_execute_factory( - { - predictions_path / "prediction.train.parquet": mock_dataframe, - predictions_path / "prediction.test.parquet": mock_dataframe, - predictions_path / "prediction.validation.parquet": mock_dataframe, - }, + mock_duckdb_sql_factory( + [ + mock_dataframe, + mock_dataframe, + mock_dataframe, + ], ) target_storage_directory = tmp_path / "pipeline" # When prediction_results = serialize_predictions( - mock_pipeline, - mock_container, - target_storage_directory, + pipeline=mock_pipeline, + container=mock_container, + target_storage_directory=target_storage_directory, ) # Then @@ -258,33 +245,24 @@ def test_serialize_predictions( # noqa: PLR0913 _assert_predictions_valid( prediction_results, subset, - target_storage_directory / "predictions", ) def _assert_predictions_valid( predictions: DataFrameInformationByName, subset_name: str, - path: Path, ) -> None: assert predictions[subset_name] assert predictions[subset_name].name == f"prediction.{subset_name}" - expected_path = path / f"prediction.{subset_name}.parquet" - assert predictions[subset_name].path == expected_path - def _assert_features_valid( features: DataFrameInformationByName, subset_name: str, - path: Path, ) -> None: assert features[subset_name] assert features[subset_name].name == f"features.{subset_name}" - expected_path = path / f"features.{subset_name}.parquet" - assert features[subset_name].path == expected_path - @pytest.mark.unit @pytest.mark.parametrize( @@ -297,7 +275,7 @@ def _assert_features_valid( (getml_feature_learner.RelMT(), RelMT), ], ) -def test_serialize_feature_learner( +def test_convert_feature_learner( feature_learner: getml_feature_learner.Fastboost | getml_feature_learner.FastProp | getml_feature_learner.Multirel @@ -306,10 +284,10 @@ def test_serialize_feature_learner( expected_result_type: type[FeatureLearner], ) -> None: # When - serialized_feature_learner = serialize_feature_learner(feature_learner) + converted_feature_learner = convert_feature_learner(feature_learner) # Then - assert isinstance(serialized_feature_learner, expected_result_type) + assert isinstance(converted_feature_learner, expected_result_type) @pytest.mark.unit @@ -324,7 +302,7 @@ def test_serialize_feature_learner( (getml_predictor.XGBoostRegressor(), XGBoostRegressor), ], ) -def test_serialize_predictor( +def test_convert_predictor( predictor: getml_predictor.LinearRegression | getml_predictor.LogisticRegression | getml_predictor.ScaleGBMClassifier @@ -334,10 +312,10 @@ def test_serialize_predictor( expected_result_type: type[Predictor], ) -> None: # When - serialized_predictor = serialize_predictor(predictor) + converted_predictor = convert_predictor(predictor) # Then - assert isinstance(serialized_predictor, expected_result_type) + assert isinstance(converted_predictor, expected_result_type) @pytest.mark.unit @@ -353,7 +331,7 @@ def test_serialize_predictor( (getml_preprocessor.TextFieldSplitter(), TextFieldSplitter), ], ) -def test_serialize_preprocessor( +def test_convert_preprocessor( preprocessor: getml_preprocessor.CategoryTrimmer | getml_preprocessor.EmailDomain | getml_preprocessor.Imputation @@ -364,34 +342,34 @@ def test_serialize_preprocessor( expected_result_type: type[Preprocessor], ) -> None: # When - serialized_preprocessor = serialize_preprocessor(preprocessor) + converted_preprocessor = convert_preprocessor(preprocessor) # Then - assert isinstance(serialized_preprocessor, expected_result_type) + assert isinstance(converted_preprocessor, expected_result_type) @pytest.mark.unit -def test_serialize_features(mock_features: GetMLFeatures) -> None: +def test_convert_features(mock_features: GetMLFeatures) -> None: # Given # When - features = serialize_features(mock_features) + converted_features = convert_features(mock_features) # Then - assert len(features) == 1 - feature = features["test_feature"] + assert len(converted_features) == 1 + feature = converted_features["test_feature"] assert feature.name == "test_feature" assert feature.index == 0 @pytest.mark.unit -def test_serialize_scores_regression(mock_scores_regression: GetMLScores) -> None: +def test_convert_scores_regression(mock_scores_regression: GetMLScores) -> None: # Given # When - scores = serialize_scores(mock_scores_regression) + converted_scores = convert_scores(mock_scores_regression) # Then - assert len(scores) == 1 - score = scores[0] + assert len(converted_scores) == 1 + score = converted_scores[0] assert isinstance(score, RegressionScore) assert score.type == "regression" assert score.date_time == datetime(2023, 1, 1, 12, 0, 0, tzinfo=timezone.utc) @@ -403,7 +381,7 @@ def test_serialize_scores_regression(mock_scores_regression: GetMLScores) -> Non @pytest.mark.unit -def test_serialize_scores_regression_wrong_type( +def test_convert_scores_regression_wrong_type( mock_scores_classification: GetMLScores, mock_scores_regression: GetMLScores, ) -> None: @@ -412,20 +390,20 @@ def test_serialize_scores_regression_wrong_type( # When / Then with pytest.raises(WrongPipelineScoreTypeError): - _ = serialize_scores(mock_scores_regression) + _ = convert_scores(mock_scores_regression) @pytest.mark.unit -def test_serialize_scores_classification( +def test_convert_scores_classification( mock_scores_classification: GetMLScores, ) -> None: # Given # When - scores = serialize_scores(mock_scores_classification) + converted_scores = convert_scores(mock_scores_classification) # Then - assert len(scores) == 1 - score = scores[0] + assert len(converted_scores) == 1 + score = converted_scores[0] assert isinstance(score, ClassificationScore) assert score.type == "classification" assert score.date_time == datetime(2023, 1, 1, 12, 0, 0, tzinfo=timezone.utc) @@ -437,7 +415,7 @@ def test_serialize_scores_classification( @pytest.mark.unit -def test_serialize_scores_classification_wrong_type( +def test_convert_scores_classification_wrong_type( mock_scores_classification: GetMLScores, mock_scores_regression: GetMLScores, ) -> None: @@ -446,19 +424,19 @@ def test_serialize_scores_classification_wrong_type( # When / Then with pytest.raises(WrongPipelineScoreTypeError): - _ = serialize_scores(mock_scores_classification) + _ = convert_scores(mock_scores_classification) @pytest.mark.unit -def test_serialize_columns(mock_columns: GetMLColumns) -> None: +def test_convert_columns(mock_columns: GetMLColumns) -> None: # Given # When - columns = serialize_columns(mock_columns) + converted_columns = convert_columns(mock_columns) # Then - assert len(columns) == 1 - assert columns[0] == Column( + assert len(converted_columns) == 1 + assert converted_columns[0] == Column( index=0, name="target0", marker="test_marker_0", @@ -469,58 +447,58 @@ def test_serialize_columns(mock_columns: GetMLColumns) -> None: @pytest.mark.unit -def test_serialize_columns_empty() -> None: +def test_convert_columns_empty() -> None: # Given # When - columns = serialize_columns(None) + converted_columns = convert_columns(None) # Then - assert len(columns) == 0 + assert len(converted_columns) == 0 @pytest.mark.unit -def test_serialize_metadata( +def test_convert_metadata( getml_all_metadata: GetMLAllMetadata, roles_empty: Roles, ) -> None: # Given # When - pipeline_metadata = serialize_all_metadata(getml_all_metadata) + converted_pipeline_metadata = convert_all_metadata(getml_all_metadata) # Then - assert pipeline_metadata.population == DataFrameMetaData( + assert converted_pipeline_metadata.population == DataFrameMetaData( name="placeholder_population", roles=roles_empty, ) - assert pipeline_metadata.peripheral == [ + assert converted_pipeline_metadata.peripheral == [ DataFrameMetaData(name="placeholder_peripheral", roles=roles_empty), ] @pytest.mark.unit -def test_serialize_metadata_empty() -> None: +def test_convert_metadata_empty() -> None: # Given # When - pipeline_metadata = serialize_all_metadata(None) + converted_pipeline_metadata = convert_all_metadata(None) # Then - assert pipeline_metadata.population is None - assert pipeline_metadata.peripheral == [] + assert converted_pipeline_metadata.population is None + assert converted_pipeline_metadata.peripheral == [] @pytest.mark.unit -def test_serialize_tables(mock_tables: GetMLTables) -> None: +def test_convert_tables(mock_tables: GetMLTables) -> None: # Given # When - tables = serialize_tables(mock_tables) + converted_tables = convert_tables(mock_tables) # Then - assert len(tables) == 1 - assert tables[0] == Table( + assert len(converted_tables) == 1 + assert converted_tables[0] == Table( name="test_table", marker="test_marker", target="target0", diff --git a/tests/unit/serialize/test_project.py b/tests/unit/serialize/test_project.py index 0114f64..9e40893 100644 --- a/tests/unit/serialize/test_project.py +++ b/tests/unit/serialize/test_project.py @@ -4,7 +4,7 @@ import pytest_mock from getml_io.getml.project import Project -from getml_io.getml.project_information import ProjectInformation +from getml_io.getml.project_identification import ProjectIdentification from getml_io.metadata.container_information import ContainerInformation from getml_io.metadata.pipeline_information import PipelineInformation from getml_io.serialize.project import serialize_project @@ -14,7 +14,7 @@ def test_serialize_project( # noqa: PLR0913 mocker: pytest_mock.MockerFixture, tmp_path: Path, - project_information: ProjectInformation, + project_identification: ProjectIdentification, container_information_empty: ContainerInformation, pipeline_information_empty: PipelineInformation, mock_project_empty: Project, @@ -31,18 +31,18 @@ def test_serialize_project( # noqa: PLR0913 ) mock_serialize_container = mocker.patch( "getml_io.serialize.project.serialize_container", - return_value=(tmp_path, container_information_empty), + return_value=container_information_empty, ) mock_serialize_pipeline = mocker.patch( "getml_io.serialize.project.serialize_pipeline", - return_value=(tmp_path, pipeline_information_empty), + return_value=pipeline_information_empty, ) # When - serialize_project( - project_information, - root_storage_directory, + project_information = serialize_project( + project_identification=project_identification, + root_storage_directory=root_storage_directory, clear_storage_directory=True, ) @@ -51,3 +51,6 @@ def test_serialize_project( # noqa: PLR0913 mock_load_project.assert_called() mock_serialize_container.assert_called() mock_serialize_pipeline.assert_called() + + assert project_information.container_information == container_information_empty + assert project_information.pipeline_information == pipeline_information_empty diff --git a/tests/unit/serialize/test_table.py b/tests/unit/serialize/test_table.py new file mode 100644 index 0000000..34c5ce3 --- /dev/null +++ b/tests/unit/serialize/test_table.py @@ -0,0 +1,88 @@ +from pathlib import Path + +import pyarrow as pa +import pytest +from getml.data import DataFrame + +from getml_io.serialize.table import ( + serialize_table, # pyright: ignore [reportUnknownVariableType] +) +from getml_io.utils.convert import assume_is_str +from tests.unit.conftest import ( + MockDuckDBSQLFactory, + column_information_by_name_to_json, + get_expected_column_information_by_name, +) + + +@pytest.mark.unit +def test_serialize_table( + tmp_path: Path, + mock_dataframe: DataFrame, + mock_duckdb_sql_factory: MockDuckDBSQLFactory, +) -> None: + # Given + mock_duckdb_sql_factory( + [ + mock_dataframe, + ], + ) + table = pa.Table.from_pydict( # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + { + "mocked": [], + }, + ) + + # When + dataframe_information = serialize_table( + table=table, # pyright: ignore[reportUnknownArgumentType] + table_name=assume_is_str(mock_dataframe.name), + get_getml_role_by_column=mock_dataframe.roles.column, + column_names=mock_dataframe.columns, + target_storage_directory=tmp_path, + filename_prefix="prefix_", + ) + + # Then + assert dataframe_information.name == mock_dataframe.name + expected_column_information_by_name = get_expected_column_information_by_name() + assert column_information_by_name_to_json( + dataframe_information.columns, + ) == column_information_by_name_to_json(expected_column_information_by_name) + + parquet_filepath = tmp_path / f"prefix_.{mock_dataframe.name}.parquet" + assert parquet_filepath.exists() + assert parquet_filepath.is_file() + + +@pytest.mark.unit +def test_serialize_table_without_storage_directory( + mock_dataframe: DataFrame, + mock_duckdb_sql_factory: MockDuckDBSQLFactory, +) -> None: + # Given + mock_duckdb_sql_factory( + [ + mock_dataframe, + ], + ) + table = pa.Table.from_pydict( # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] + { + "mocked": [], + }, + ) + + # When + dataframe_information = serialize_table( + table=table, # pyright: ignore[reportUnknownArgumentType] + table_name=assume_is_str(mock_dataframe.name), + get_getml_role_by_column=mock_dataframe.roles.column, + column_names=mock_dataframe.columns, + ) + + # Then + assert dataframe_information.name == mock_dataframe.name + expected_column_information_by_name = get_expected_column_information_by_name() + assert column_information_by_name_to_json( + dataframe_information.columns, + ) == column_information_by_name_to_json(expected_column_information_by_name) diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 3094a32..90e9243 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -6,7 +6,7 @@ from typer.testing import CliRunner from getml_io.cli import DEFAULT_ROOT_STORAGE_DIRECTORY, LogLevel, app -from getml_io.getml.project_information import ProjectInformation +from getml_io.getml.project_identification import ProjectIdentification @pytest.mark.unit @@ -20,7 +20,7 @@ def test_log_level() -> None: @pytest.mark.unit def test_serialize( mocker: pytest_mock.MockerFixture, - project_information: ProjectInformation, + project_identification: ProjectIdentification, ) -> None: # Given mock_serialize_project = mocker.patch( @@ -38,19 +38,19 @@ def test_serialize( [ "serialize", "--project", - project_information.project_name, + project_identification.project_name, "--pipeline", - project_information.pipeline_id, + project_identification.pipeline_id, "--container", - project_information.container_id, + project_identification.container_id, ], ) # Then assert result.exit_code == 0 mock_serialize_project.assert_called_with( - project_information, - DEFAULT_ROOT_STORAGE_DIRECTORY, + project_identification=project_identification, + root_storage_directory=DEFAULT_ROOT_STORAGE_DIRECTORY, clear_storage_directory=False, ) assert mock_logging_basic_config.call_args.kwargs["level"] == logging.WARNING # pyright: ignore [reportAny] @@ -60,7 +60,7 @@ def test_serialize( def test_serialize_with_clear_storage_directory( mocker: pytest_mock.MockerFixture, tmp_path: Path, - project_information: ProjectInformation, + project_identification: ProjectIdentification, ) -> None: # Given mock_serialize_project = mocker.patch( @@ -81,11 +81,11 @@ def test_serialize_with_clear_storage_directory( "INFO", "serialize", "--project", - project_information.project_name, + project_identification.project_name, "--pipeline", - project_information.pipeline_id, + project_identification.pipeline_id, "--container", - project_information.container_id, + project_identification.container_id, "--root-storage-directory", str(root_storage_directory), "--clear-storage-directory", @@ -95,8 +95,8 @@ def test_serialize_with_clear_storage_directory( # Then assert result.exit_code == 0 mock_serialize_project.assert_called_with( - project_information, - root_storage_directory, + project_identification=project_identification, + root_storage_directory=root_storage_directory, clear_storage_directory=True, ) assert mock_logging_basic_config.call_args.kwargs["level"] == logging.INFO # pyright: ignore [reportAny] @@ -126,4 +126,4 @@ def test_deserialize_not_implemented() -> None: # Then assert result.exit_code == 1 assert type(result.exception) is NotImplementedError - assert "Deserializing ProjectInformation" in result.exception.args[0] + assert "Deserializing ProjectIdentification" in result.exception.args[0] diff --git a/tests/unit/utils/test_storage.py b/tests/unit/utils/test_storage.py index 190afbd..5268f8e 100644 --- a/tests/unit/utils/test_storage.py +++ b/tests/unit/utils/test_storage.py @@ -8,7 +8,7 @@ from platformdirs.unix import Unix from platformdirs.windows import Windows -from getml_io.getml.project_information import ProjectInformation +from getml_io.getml.project_identification import ProjectIdentification from getml_io.utils.exception import ( StorageDirectoryCreationError, StorageDirectoryRemovalError, @@ -23,7 +23,7 @@ @pytest.mark.unit def test_create_target_storage_directory_not_existing_directory( tmp_path: Path, - project_information: ProjectInformation, + project_identification: ProjectIdentification, ) -> None: # Given root_storage_directory = tmp_path @@ -33,7 +33,7 @@ def test_create_target_storage_directory_not_existing_directory( # When target_storage_directory = create_target_storage_directory( - project_information, + project_identification, root_storage_directory, clear_storage_directory=False, ) @@ -47,7 +47,7 @@ def test_create_target_storage_directory_not_existing_directory( @pytest.mark.unit def test_create_target_storage_directory_empty_directory( tmp_path: Path, - project_information: ProjectInformation, + project_identification: ProjectIdentification, ) -> None: # Given root_storage_directory = tmp_path @@ -58,7 +58,7 @@ def test_create_target_storage_directory_empty_directory( # When target_storage_directory = create_target_storage_directory( - project_information, + project_identification, root_storage_directory, clear_storage_directory=False, ) @@ -72,7 +72,7 @@ def test_create_target_storage_directory_empty_directory( @pytest.mark.unit def test_create_target_storage_directory_dirty_directory( tmp_path: Path, - project_information: ProjectInformation, + project_identification: ProjectIdentification, ) -> None: # Given root_storage_directory = tmp_path @@ -84,7 +84,7 @@ def test_create_target_storage_directory_dirty_directory( # When target_storage_directory = create_target_storage_directory( - project_information, + project_identification, root_storage_directory, clear_storage_directory=False, ) @@ -98,7 +98,7 @@ def test_create_target_storage_directory_dirty_directory( @pytest.mark.unit def test_create_target_storage_directory_clear_dirty_directory( tmp_path: Path, - project_information: ProjectInformation, + project_identification: ProjectIdentification, ) -> None: # Given root_storage_directory = tmp_path @@ -110,7 +110,7 @@ def test_create_target_storage_directory_clear_dirty_directory( # When target_storage_directory = create_target_storage_directory( - project_information, + project_identification, root_storage_directory, clear_storage_directory=True, ) @@ -125,7 +125,7 @@ def test_create_target_storage_directory_clear_dirty_directory( def test_create_target_storage_directory_directory_does_not_exist_creation_error( tmp_path: Path, mocker: pytest_mock.MockerFixture, - project_information: ProjectInformation, + project_identification: ProjectIdentification, ) -> None: # Given mock_mkdir = mocker.patch.object(Path, "mkdir", side_effect=FileNotFoundError) @@ -136,7 +136,7 @@ def test_create_target_storage_directory_directory_does_not_exist_creation_error match=r"Failed to create storage directory", ): _ = create_target_storage_directory( - project_information, + project_identification, tmp_path, clear_storage_directory=False, ) @@ -148,10 +148,10 @@ def test_create_target_storage_directory_directory_does_not_exist_creation_error def test_create_target_storage_directory_directory_clearing_error( tmp_path: Path, mocker: pytest_mock.MockerFixture, - project_information: ProjectInformation, + project_identification: ProjectIdentification, ) -> None: # Given - target_storage_path = get_target_storage_directory(project_information, tmp_path) + target_storage_path = get_target_storage_directory(project_identification, tmp_path) target_storage_path.mkdir(parents=True, exist_ok=True) (target_storage_path / "some_file.txt").touch() mock_rmtree = mocker.patch.object(shutil, "rmtree", side_effect=Exception) @@ -162,7 +162,7 @@ def test_create_target_storage_directory_directory_clearing_error( match=r"Failed to remove storage directory", ): _ = create_target_storage_directory( - project_information, + project_identification, tmp_path, clear_storage_directory=True, ) @@ -174,10 +174,10 @@ def test_create_target_storage_directory_directory_clearing_error( def test_create_target_storage_directory_after_clearing_existing_directory_error( tmp_path: Path, mocker: pytest_mock.MockerFixture, - project_information: ProjectInformation, + project_identification: ProjectIdentification, ) -> None: # Given - target_storage_path = get_target_storage_directory(project_information, tmp_path) + target_storage_path = get_target_storage_directory(project_identification, tmp_path) target_storage_path.mkdir(parents=True, exist_ok=True) (target_storage_path / "some_file.txt").touch() mock_mkdir = mocker.patch.object(Path, "mkdir", side_effect=FileExistsError) @@ -188,7 +188,7 @@ def test_create_target_storage_directory_after_clearing_existing_directory_error match=r"Failed to create storage directory", ): _ = create_target_storage_directory( - project_information, + project_identification, tmp_path, clear_storage_directory=True, )