diff --git a/src/getml_io/getml/feature_learning.py b/src/getml_io/getml/feature_learning.py new file mode 100644 index 0000000..f45d113 --- /dev/null +++ b/src/getml_io/getml/feature_learning.py @@ -0,0 +1,123 @@ +from __future__ import annotations + +from collections.abc import Set as AbstractSet +from typing import Annotated, Literal + +from getml.feature_learning.aggregations.types import ( + FastPropAggregations, + MultirelAggregations, +) +from getml.feature_learning.loss_functions import ( + CrossEntropyLossType, + SquareLossType, +) +from pydantic import Field +from pydantic.dataclasses import dataclass + + +@dataclass(frozen=True) +class FastProp: + aggregation: AbstractSet[FastPropAggregations] + delta_t: float + loss_function: CrossEntropyLossType | SquareLossType | None + max_lag: int + min_df: int + n_most_frequent: int + num_features: int + num_threads: int + sampling_factor: float + silent: bool + vocab_size: int + type: Literal["fast_prop"] = "fast_prop" + + +@dataclass(frozen=True) +class Fastboost: + gamma: float + loss_function: CrossEntropyLossType | SquareLossType | None + max_depth: int + min_child_weights: float + num_features: int + num_threads: int + reg_lambda: float + seed: int + shrinkage: float + silent: bool + subsample: float + type: Literal["fastboost"] = "fastboost" + + +@dataclass(frozen=True) +class Multirel: + aggregation: AbstractSet[MultirelAggregations] + allow_sets: bool + delta_t: float + grid_factor: float + loss_function: CrossEntropyLossType | SquareLossType | None + max_length: int + min_df: int + min_num_samples: int + num_features: int + num_subfeatures: int + num_threads: int + propositionalization: FastProp + regularization: float + round_robin: bool + sampling_factor: float + seed: int + share_aggregations: float + share_conditions: float + shrinkage: float + silent: bool + vocab_size: int + type: Literal["multirel"] = "multirel" + + +@dataclass(frozen=True) +class Relboost: + allow_null_weights: bool + delta_t: float + gamma: float + loss_function: CrossEntropyLossType | SquareLossType | None + max_depth: int + min_df: int + min_num_samples: int + num_features: int + num_subfeatures: int + num_threads: int + propositionalization: FastProp + reg_lambda: float + sampling_factor: float + seed: int + shrinkage: float + silent: bool + vocab_size: int + type: Literal["relboost"] = "relboost" + + +@dataclass(frozen=True) +class RelMT: + allow_avg: bool + delta_t: float + gamma: float + loss_function: CrossEntropyLossType | SquareLossType | None + max_depth: int + min_df: int + min_num_samples: int + num_features: int + num_subfeatures: int + num_threads: int + propositionalization: FastProp + reg_lambda: float + sampling_factor: float + seed: int + shrinkage: float + silent: bool + vocab_size: int + type: Literal["rel_mt"] = "rel_mt" + + +FeatureLearner = Annotated[ + FastProp | Fastboost | Multirel | Relboost | RelMT, + Field(discriminator="type"), +] diff --git a/src/getml_io/getml/predictors.py b/src/getml_io/getml/predictors.py new file mode 100644 index 0000000..662cf2b --- /dev/null +++ b/src/getml_io/getml/predictors.py @@ -0,0 +1,126 @@ +from __future__ import annotations + +from typing import Annotated, Literal + +from pydantic import Field +from pydantic.dataclasses import dataclass + + +@dataclass(frozen=True) +class LinearRegression: + learning_rate: float + reg_lambda: float + type: Literal["linear_regression"] = "linear_regression" + + +@dataclass(frozen=True) +class LogisticRegression: + learning_rate: float + reg_lambda: float + type: Literal["logistic_regression"] = "logistic_regression" + + +@dataclass(frozen=True) +class ScaleGBMClassifier: + colsample_bylevel: float + colsample_bytree: float + early_stopping_rounds: int + gamma: float + goss_a: float + goss_b: float + learning_rate: float + max_depth: int + min_child_weights: float + n_estimators: int + n_jobs: int + objective: Literal["binary:logistic"] + reg_lambda: float + seed: int + type: Literal["scale_gbm_classifier"] = "scale_gbm_classifier" + + +@dataclass(frozen=True) +class ScaleGBMRegressor: + colsample_bylevel: float + colsample_bytree: float + early_stopping_rounds: int + gamma: float + goss_a: float + goss_b: float + learning_rate: float + max_depth: int + min_child_weights: float + n_estimators: int + n_jobs: int + objective: Literal["reg:squarederror"] + reg_lambda: float + seed: int + type: Literal["scale_gbm_regressor"] = "scale_gbm_regressor" + + +@dataclass(frozen=True) +class XGBoostClassifier: + booster: str + colsample_bylevel: float + colsample_bytree: float + early_stopping_rounds: int + gamma: float + learning_rate: float + max_delta_step: float + max_depth: int + min_child_weights: float + n_estimators: int + external_memory: bool + normalize_type: str + num_parallel_tree: int + n_jobs: int + objective: Literal["reg:logistic", "binary:logistic", "binary:logitraw"] + one_drop: bool + rate_drop: float + reg_alpha: float + reg_lambda: float + sample_type: str + silent: bool + skip_drop: float + subsample: float + type: Literal["xgboost_classifier"] = "xgboost_classifier" + + +@dataclass(frozen=True) +class XGBoostRegressor: + booster: str + colsample_bylevel: float + colsample_bytree: float + early_stopping_rounds: int + external_memory: bool + gamma: float + learning_rate: float + max_delta_step: float + max_depth: int + min_child_weights: float + n_estimators: int + normalize_type: str + num_parallel_tree: int + n_jobs: int + objective: Literal["reg:squarederror", "reg:tweedie", "reg:linear"] + one_drop: bool + rate_drop: float + reg_alpha: float + reg_lambda: float + sample_type: str + silent: bool + skip_drop: float + subsample: float + type: Literal["xgboost_regressor"] = "xgboost_regressor" + + +FeatureSelector = Annotated[ + LinearRegression + | LogisticRegression + | ScaleGBMRegressor + | ScaleGBMClassifier + | XGBoostRegressor + | XGBoostClassifier, + Field(discriminator="type"), +] +Predictor = FeatureSelector diff --git a/src/getml_io/getml/preprocessors.py b/src/getml_io/getml/preprocessors.py new file mode 100644 index 0000000..09db575 --- /dev/null +++ b/src/getml_io/getml/preprocessors.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +from collections.abc import Set as AbstractSet +from typing import Annotated, Literal + +from getml.feature_learning.aggregations.types import MappingAggregations +from pydantic import Field +from pydantic.dataclasses import dataclass + + +@dataclass(frozen=True) +class CategoryTrimmer: + max_num_categories: int + min_freq: int + type: Literal["category_trimmer"] = "category_trimmer" + + +@dataclass(frozen=True) +class EmailDomain: + type: Literal["email_domain"] = "email_domain" + + +@dataclass(frozen=True) +class Imputation: + add_dummies: bool + type: Literal["imputation"] = "imputation" + + +@dataclass(frozen=True) +class Mapping: + aggregation: AbstractSet[MappingAggregations] + min_freq: int + multithreading: bool + type: Literal["mapping"] = "mapping" + + +@dataclass(frozen=True) +class Seasonal: + disable_year: bool + disable_month: bool + disable_weekday: bool + disable_hour: bool + disable_minute: bool + type: Literal["seasonal"] = "seasonal" + + +@dataclass(frozen=True) +class Substring: + begin: int + length: int + unit: str + type: Literal["substring"] = "substring" + + +@dataclass(frozen=True) +class TextFieldSplitter: + type: Literal["text_field_splitter"] = "text_field_splitter" + + +Preprocessor = Annotated[ + CategoryTrimmer + | EmailDomain + | Imputation + | Mapping + | Seasonal + | Substring + | TextFieldSplitter, + Field(discriminator="type"), +] diff --git a/src/getml_io/getml/relationships.py b/src/getml_io/getml/relationships.py new file mode 100644 index 0000000..d34aedd --- /dev/null +++ b/src/getml_io/getml/relationships.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from enum import Enum + +from getml.data import relationship as getml_relationship + + +class Relationship(str, Enum): + MANY_TO_MANY = getml_relationship.many_to_many + MANY_TO_ONE = getml_relationship.many_to_one + ONE_TO_MANY = getml_relationship.one_to_many + ONE_TO_ONE = getml_relationship.one_to_one + PROPOSITIONALIZATION = getml_relationship.propositionalization diff --git a/src/getml_io/getml/roles.py b/src/getml_io/getml/roles.py new file mode 100644 index 0000000..44d6bce --- /dev/null +++ b/src/getml_io/getml/roles.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +from collections.abc import Sequence +from enum import Enum + +from getml.data import roles +from pydantic.dataclasses import dataclass + + +class Role(str, Enum): + CATEGORICAL = roles.categorical + JOIN_KEY = roles.join_key + NUMERICAL = roles.numerical + TARGET = roles.target + TEXT = roles.text + TIME_STAMP = roles.time_stamp + UNUSED_FLOAT = roles.unused_float + UNUSED_STRING = roles.unused_string + + +@dataclass(frozen=True) +class Roles: + categorical: Sequence[str] + join_key: Sequence[str] + numerical: Sequence[str] + target: Sequence[str] + text: Sequence[str] + time_stamp: Sequence[str] + unused_float: Sequence[str] + unused_string: Sequence[str] diff --git a/src/getml_io/metadata/container_information.py b/src/getml_io/metadata/container_information.py index d9e4b9d..441431b 100644 --- a/src/getml_io/metadata/container_information.py +++ b/src/getml_io/metadata/container_information.py @@ -30,6 +30,7 @@ class ContainerInformation: deep_copy: bool path: Annotated[Path, Field(exclude=True)] + # TODO @urfoex: #53 Adjust relative path already in serialization function @model_serializer() def _serialize_model( self, diff --git a/src/getml_io/metadata/data_model_information.py b/src/getml_io/metadata/data_model_information.py new file mode 100644 index 0000000..25ef133 --- /dev/null +++ b/src/getml_io/metadata/data_model_information.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from collections.abc import Mapping, Sequence + +from pydantic.dataclasses import dataclass + +from getml_io.metadata.placeholder_information import PlaceholderInformation + + +@dataclass(frozen=True) +class DataModelInformation: + population: PlaceholderInformation + peripheral: Mapping[str, Sequence[PlaceholderInformation]] diff --git a/src/getml_io/metadata/dataframe_information.py b/src/getml_io/metadata/dataframe_information.py index b01bc41..ba25820 100644 --- a/src/getml_io/metadata/dataframe_information.py +++ b/src/getml_io/metadata/dataframe_information.py @@ -2,26 +2,13 @@ from collections.abc import Mapping from datetime import datetime -from enum import Enum from pathlib import Path -from typing import Annotated, Literal, get_args +from typing import Annotated, Literal -import getml.data.roles.types as roles from pydantic import Field, TypeAdapter from pydantic.dataclasses import dataclass -from getml_io.utils.convert import assume_is_str - - -class Role(str, Enum): - CATEGORICAL = assume_is_str(get_args(roles.Categorical)[0]) - JOIN_KEY = assume_is_str(get_args(roles.JoinKey)[0]) - NUMERICAL = assume_is_str(get_args(roles.Numerical)[0]) - TARGET = assume_is_str(get_args(roles.Target)[0]) - TEXT = assume_is_str(get_args(roles.Text)[0]) - TIME_STAMP = assume_is_str(get_args(roles.TimeStamp)[0]) - UNUSED_FLOAT = assume_is_str(get_args(roles.UnusedFloat)[0]) - UNUSED_STRING = assume_is_str(get_args(roles.UnusedString)[0]) +from getml_io.getml.roles import Role @dataclass(frozen=True) @@ -123,7 +110,7 @@ class ColumnProfile: class DataFrameInformation: name: str path: Path - profile: Mapping[str, ColumnProfile] + column_profile: Mapping[str, ColumnProfile] ROLE_TYPE_ADAPTER_MAPPING = { diff --git a/src/getml_io/metadata/feature_sets.py b/src/getml_io/metadata/feature_sets.py index c337225..cb315e2 100644 --- a/src/getml_io/metadata/feature_sets.py +++ b/src/getml_io/metadata/feature_sets.py @@ -1,5 +1,6 @@ +from collections.abc import Mapping from typing import TypeAlias from getml_io.metadata.dataframe_information import DataFrameInformation -FeatureSets: TypeAlias = dict[str, DataFrameInformation] +FeatureSets: TypeAlias = Mapping[str, DataFrameInformation] diff --git a/src/getml_io/metadata/pipeline_information.py b/src/getml_io/metadata/pipeline_information.py index 4e0fe18..903d38d 100644 --- a/src/getml_io/metadata/pipeline_information.py +++ b/src/getml_io/metadata/pipeline_information.py @@ -1,23 +1,62 @@ +from __future__ import annotations + +from collections.abc import Sequence +from enum import Enum from pathlib import Path from typing import Annotated -from pydantic import Field, model_serializer +from getml.feature_learning.loss_functions import ( + CROSSENTROPYLOSS, + SQUARELOSS, +) +from pydantic import ( + Field, + model_serializer, +) from pydantic.dataclasses import dataclass from typing_extensions import TypedDict +from getml_io.getml.feature_learning import FeatureLearner +from getml_io.getml.predictors import FeatureSelector, Predictor +from getml_io.getml.preprocessors import Preprocessor +from getml_io.metadata.data_model_information import DataModelInformation from getml_io.metadata.exception import ( DataFrameInformationPathNotRelativeError, TableInformationPathNotRelativeError, ) from getml_io.metadata.feature_sets import FeatureSets +from getml_io.metadata.placeholder_information import PlaceholderInformation from getml_io.metadata.prediction_results import PredictionResults from getml_io.metadata.utils import derive_instance_with_relative_path +class LossFunction(str, Enum): + CROSS_ENTROPY_LOSS = CROSSENTROPYLOSS + SQUARE_LOSS = SQUARELOSS + + class PipelineInformationDict(TypedDict): id: str predictions: PredictionResults feature_sets: FeatureSets + feature_learners: Sequence[FeatureLearner] + feature_selectors: Sequence[FeatureSelector] + include_categorical: bool + is_classification: bool + is_regression: bool + loss_function: LossFunction + peripheral: Sequence[PlaceholderInformation] + predictors: Sequence[Predictor] + preprocessors: Sequence[Preprocessor] + share_selected_features: float + tags: Sequence[str] + targets: Sequence[str] + data_model: DataModelInformation + # features # TODO @urfoex: #17 + # scores # TODO @urfoex: #18 + # columns # TODO @urfoex: #50 + # metadata # TODO @urfoex: #51 + # tables # TODO @urfoex: #52 @dataclass @@ -26,31 +65,59 @@ class PipelineInformation: predictions: PredictionResults feature_sets: FeatureSets path: Annotated[Path, Field(exclude=True)] + feature_learners: Sequence[FeatureLearner] + feature_selectors: Sequence[FeatureSelector] + include_categorical: bool + is_classification: bool + is_regression: bool + loss_function: LossFunction + peripheral: Sequence[PlaceholderInformation] + predictors: Sequence[Predictor] + preprocessors: Sequence[Preprocessor] + share_selected_features: float + tags: Sequence[str] + targets: Sequence[str] + data_model: DataModelInformation + # features # TODO @urfoex: #17 + # scores # TODO @urfoex: #18 + # columns # TODO @urfoex: #50 + # metadata # TODO @urfoex: #51 + # tables # TODO @urfoex: #52 + # TODO @urfoex: #53 Adjust relative path already in serialization function @model_serializer() def _serialize_model(self) -> PipelineInformationDict: return PipelineInformationDict( { "id": self.id, - "predictions": PredictionResults( - { - name: derive_instance_with_relative_path( - table_information, - self.path, - TableInformationPathNotRelativeError, - ) - for name, table_information in self.predictions.items() - }, - ), - "feature_sets": FeatureSets( - { - name: derive_instance_with_relative_path( - dataframe_information, - self.path, - DataFrameInformationPathNotRelativeError, - ) - for name, dataframe_information in self.feature_sets.items() - }, - ), + "predictions": { + name: derive_instance_with_relative_path( + table_information, + self.path, + TableInformationPathNotRelativeError, + ) + for name, table_information in self.predictions.items() + }, + "feature_sets": { + name: derive_instance_with_relative_path( + dataframe_information, + self.path, + DataFrameInformationPathNotRelativeError, + ) + for name, dataframe_information in self.feature_sets.items() + }, + "feature_learners": self.feature_learners, + "feature_selectors": self.feature_selectors, + "include_categorical": self.include_categorical, + "is_classification": self.is_classification, + "is_regression": self.is_regression, + "loss_function": self.loss_function, + "peripheral": self.peripheral, + "predictors": self.predictors, + "preprocessors": self.preprocessors, + "share_selected_features": self.share_selected_features, + "tags": self.tags, + "targets": self.targets, + "data_model": self.data_model, }, ) diff --git a/src/getml_io/metadata/placeholder_information.py b/src/getml_io/metadata/placeholder_information.py new file mode 100644 index 0000000..32fea54 --- /dev/null +++ b/src/getml_io/metadata/placeholder_information.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from collections.abc import Sequence + +from pydantic.dataclasses import dataclass + +from getml_io.getml.relationships import Relationship +from getml_io.getml.roles import Roles + + +@dataclass(frozen=True) +class JoinInformation: + right: PlaceholderInformation + on: Sequence[tuple[str, str]] | Sequence[tuple[None, None]] + time_stamps: str | tuple[str, str] | None + upper_time_stamp: str | None + relationship: Relationship | None + memory: float | None + horizon: float | None + lagged_targets: bool | None + + +@dataclass(frozen=True) +class PlaceholderInformation: + name: str + roles: Roles + joins: Sequence[JoinInformation] + parent: str | None diff --git a/src/getml_io/metadata/prediction_results.py b/src/getml_io/metadata/prediction_results.py index 43d6e9f..1391c46 100644 --- a/src/getml_io/metadata/prediction_results.py +++ b/src/getml_io/metadata/prediction_results.py @@ -1,5 +1,6 @@ +from collections.abc import Mapping from typing import TypeAlias from getml_io.metadata.table_information import TableInformation -PredictionResults: TypeAlias = dict[str, TableInformation] +PredictionResults: TypeAlias = Mapping[str, TableInformation] diff --git a/src/getml_io/metadata/utils.py b/src/getml_io/metadata/utils.py index ff55e6d..9f23967 100644 --- a/src/getml_io/metadata/utils.py +++ b/src/getml_io/metadata/utils.py @@ -35,10 +35,17 @@ def derive_instance_with_relative_path( PathNotRelativeError: If the instance's path cannot be made relative to the base path. The specific subclass raised is determined by the `error_factory`. + TypeError: If the instance is not a dataclass. """ + if not dataclasses.is_dataclass(instance): + message = f"Instance must be a dataclass: {type(instance)}" + raise TypeError(message) try: - return dataclasses.replace(instance, path=instance.path.relative_to(base_path)) + return dataclasses.replace( + instance, + path=instance.path.relative_to(base_path), + ) except Exception as exception: error = error_factory( instance.name, diff --git a/src/getml_io/serialize/data_model.py b/src/getml_io/serialize/data_model.py new file mode 100644 index 0000000..a2f0e23 --- /dev/null +++ b/src/getml_io/serialize/data_model.py @@ -0,0 +1,33 @@ +from typing import cast + +from getml.data import DataModel +from getml.data.placeholder import Placeholder + +from getml_io.metadata.data_model_information import ( + DataModelInformation, +) +from getml_io.serialize.placeholder import serialize_placeholder + + +def serialize_data_model(data_model: DataModel) -> DataModelInformation: + """Serialize a getML DataModel into a DataModelInformation object. + + Args: + data_model: The DataModel to serialize. + + Returns: + DataModelInformation: The serialized DataModel information. + + """ + return DataModelInformation( + population=serialize_placeholder( + data_model.population, + ), + peripheral={ + name: [serialize_placeholder(placeholder) for placeholder in placeholders] + for name, placeholders in cast( + "dict[str, list[Placeholder]]", + cast("object", data_model.peripheral), + ).items() + }, + ) diff --git a/src/getml_io/serialize/dataframe_or_view.py b/src/getml_io/serialize/dataframe_or_view.py index e885549..dfa7254 100644 --- a/src/getml_io/serialize/dataframe_or_view.py +++ b/src/getml_io/serialize/dataframe_or_view.py @@ -1,8 +1,10 @@ from __future__ import annotations import logging +from collections.abc import Mapping from logging import Logger from pathlib import Path +from typing import cast import duckdb from getml.data import ( @@ -11,6 +13,7 @@ ) from pydantic import TypeAdapter +from getml_io.getml.roles import Role from getml_io.metadata.dataframe_information import ( ROLE_TYPE_ADAPTER_MAPPING, ColumnProfile, @@ -24,12 +27,12 @@ ColumnStatisticsUnusedFloat, ColumnStatisticsUnusedString, DataFrameInformation, - Role, ) from getml_io.serialize.exception import ( DataFrameParquetStorageError, UnsupportedColumnStatisticsError, ) +from getml_io.serialize.roles import serialize_role from getml_io.utils.convert import assume_is_str from getml_io.utils.exception import StorageDirectoryCreationError @@ -83,16 +86,16 @@ def serialize_dataframe_or_view( parquet_filepath, ) from exception - profile = _calculate_profile(parquet_filepath, dataframe_or_view) + column_profile = _calculate_column_profile(parquet_filepath, dataframe_or_view) return DataFrameInformation( name=name, path=parquet_filepath, - profile=profile, + column_profile=column_profile, ) -def _calculate_profile( +def _calculate_column_profile( parquet_filepath: Path, dataframe_or_view: DataFrame | View, ) -> dict[str, ColumnProfile]: @@ -134,17 +137,24 @@ def _fetch_raw_summary_statistics( "Calculating summary statistics for Parquet '%s'", parquet_filepath, ) - return ( - connection.execute(SUMMARIZE_STATEMENT_TEMPLATE, [str(parquet_filepath)]) - .df() - .set_index("column_name") - .to_dict(orient="index") + return cast( + "dict[str, dict[str, str | int | float]]", + cast( + "object", + connection.execute( + SUMMARIZE_STATEMENT_TEMPLATE, + [str(parquet_filepath)], + ) + .df() + .set_index("column_name") + .to_dict(orient="index"), + ), ) def _build_column_statistics( dataframe_or_view: DataFrame | View, - raw_summary_statistics: dict[str, dict[str, str | int | float]], + raw_summary_statistics: Mapping[str, Mapping[str, str | int | float]], ) -> dict[str, ColumnStatistics]: return { name: _get_column_statistics_adapter( @@ -172,7 +182,7 @@ def _get_column_statistics_adapter( | TypeAdapter[ColumnStatisticsUnusedFloat] | TypeAdapter[ColumnStatisticsUnusedString] ): - role = Role(assume_is_str(dataframe_or_view.roles.column(name))) + role = serialize_role(dataframe_or_view.roles.column(name)) adapter = ROLE_TYPE_ADAPTER_MAPPING.get((role, column_type)) if adapter is None: raise UnsupportedColumnStatisticsError( diff --git a/src/getml_io/serialize/exception.py b/src/getml_io/serialize/exception.py index ecae607..dc0737c 100644 --- a/src/getml_io/serialize/exception.py +++ b/src/getml_io/serialize/exception.py @@ -1,6 +1,7 @@ from pathlib import Path -from getml_io.metadata.dataframe_information import ROLE_TYPE_ADAPTER_MAPPING, Role +from getml_io.getml.roles import Role +from getml_io.metadata.dataframe_information import ROLE_TYPE_ADAPTER_MAPPING from getml_io.utils.exception import GetMLIOError diff --git a/src/getml_io/serialize/pipeline.py b/src/getml_io/serialize/pipeline.py index 3af9ab3..11af6e0 100644 --- a/src/getml_io/serialize/pipeline.py +++ b/src/getml_io/serialize/pipeline.py @@ -1,21 +1,58 @@ +import dataclasses from pathlib import Path from typing import cast import numpy as np +from getml import feature_learning as getml_feature_learner +from getml import predictors as getml_predictor +from getml import preprocessors as getml_preprocessor from getml.data import ( Container, DataFrame, ) from getml.pipeline import Pipeline from numpy.typing import NDArray +from pydantic import TypeAdapter +from getml_io.getml.feature_learning import ( + Fastboost, + FastProp, + FeatureLearner, + Multirel, + Relboost, + RelMT, +) +from getml_io.getml.predictors import ( + LinearRegression, + LogisticRegression, + Predictor, + ScaleGBMClassifier, + ScaleGBMRegressor, + XGBoostClassifier, + XGBoostRegressor, +) +from getml_io.getml.preprocessors import ( + CategoryTrimmer, + EmailDomain, + Imputation, + Mapping, + Preprocessor, + Seasonal, + Substring, + TextFieldSplitter, +) from getml_io.metadata.feature_sets import FeatureSets -from getml_io.metadata.pipeline_information import PipelineInformation +from getml_io.metadata.pipeline_information import ( + LossFunction, + PipelineInformation, +) from getml_io.metadata.prediction_results import PredictionResults from getml_io.metadata.table_information import TableInformation +from getml_io.serialize.data_model import serialize_data_model from getml_io.serialize.dataframe_or_view import serialize_dataframe_or_view from getml_io.serialize.ndarray import serialize_ndarray from getml_io.serialize.pipeline_information import serialize_pipeline_information +from getml_io.serialize.placeholder import serialize_placeholder from getml_io.utils.convert import ( assume_is_dict_str_to_dataframe_or_view, ) @@ -54,6 +91,37 @@ def serialize_pipeline( target_storage_directory=pipeline_storage_directory, ), path=target_storage_directory, + feature_learners=[ + serialize_feature_learner(feature_learner) + for feature_learner in pipeline.feature_learners + ], + feature_selectors=[ + serialize_predictor(feature_selector) + for feature_selector in pipeline.feature_selectors + ], + include_categorical=pipeline.include_categorical, + is_classification=pipeline.is_classification, + is_regression=pipeline.is_regression, + loss_function=LossFunction(pipeline.loss_function), + peripheral=[ + serialize_placeholder(placeholder) for placeholder in pipeline.peripheral + ], + predictors=[ + serialize_predictor(predictor) for predictor in pipeline.predictors + ], + preprocessors=[ + serialize_preprocessor(preprocessor) + for preprocessor in pipeline.preprocessors + ], + share_selected_features=pipeline.share_selected_features, + tags=pipeline.tags, + targets=pipeline.targets, + data_model=serialize_data_model(pipeline.data_model), + # features # TODO @urfoex: #17 + # scores # TODO @urfoex: #18 + # columns # TODO @urfoex: #50 + # metadata # TODO @urfoex: #51 + # tables # TODO @urfoex: #52 ) pipeline_information_json_path = serialize_pipeline_information( pipeline_information=pipeline_information, @@ -81,9 +149,10 @@ def serialize_predictions( """ predict_storage_directory = target_storage_directory / "predictions" - prediction_results = PredictionResults() + prediction_results: PredictionResults = {} for subset_name in assume_is_dict_str_to_dataframe_or_view(container.subsets): prediction = pipeline.predict(container[subset_name]) + # TODO @urfoex: #54 Convert NDArray to DataFrame and use dataframe serialization path = serialize_ndarray( array=cast("NDArray[np.float64]", prediction), target_storage_directory=predict_storage_directory, @@ -116,7 +185,7 @@ def serialize_feature_sets( """ transform_storage_directory = target_storage_directory / "feature_sets" - feature_sets = FeatureSets() + feature_sets: FeatureSets = {} for subset_name in assume_is_dict_str_to_dataframe_or_view(container.subsets): features = pipeline.transform( container[subset_name], @@ -129,3 +198,102 @@ def serialize_feature_sets( feature_sets[subset_name] = dataframe_information return feature_sets + + +def serialize_feature_learner( + feature_learner: getml_feature_learner.Fastboost + | getml_feature_learner.FastProp + | getml_feature_learner.Multirel + | getml_feature_learner.Relboost + | getml_feature_learner.RelMT, +) -> FeatureLearner: + """Serialize a getML FeatureLearner into a FeatureLearner object. + + Args: + feature_learner: The FeatureLearner to serialize. + + Returns: + FeatureLearner: The serialized FeatureLearner information. + + """ + feature_learner_as_dict = dataclasses.asdict(feature_learner) + match feature_learner: + case getml_feature_learner.Fastboost(): + return TypeAdapter(Fastboost).validate_python(feature_learner_as_dict) + case getml_feature_learner.FastProp(): + return TypeAdapter(FastProp).validate_python(feature_learner_as_dict) + case getml_feature_learner.Multirel(): + return TypeAdapter(Multirel).validate_python(feature_learner_as_dict) + case getml_feature_learner.Relboost(): + return TypeAdapter(Relboost).validate_python(feature_learner_as_dict) + case getml_feature_learner.RelMT(): + return TypeAdapter(RelMT).validate_python(feature_learner_as_dict) + + +def serialize_predictor( + predictor: getml_predictor.LinearRegression + | getml_predictor.LogisticRegression + | getml_predictor.ScaleGBMClassifier + | getml_predictor.ScaleGBMRegressor + | getml_predictor.XGBoostClassifier + | getml_predictor.XGBoostRegressor, +) -> Predictor: + """Serialize a getML Predictor into a Predictor object. + + Args: + predictor: The Predictor to serialize. + + Returns: + Predictor: The serialized Predictor information. + + """ + predictor_as_dict = dataclasses.asdict(predictor) + match predictor: + case getml_predictor.LinearRegression(): + return TypeAdapter(LinearRegression).validate_python(predictor_as_dict) + case getml_predictor.LogisticRegression(): + return TypeAdapter(LogisticRegression).validate_python(predictor_as_dict) + case getml_predictor.ScaleGBMClassifier(): + return TypeAdapter(ScaleGBMClassifier).validate_python(predictor_as_dict) + case getml_predictor.ScaleGBMRegressor(): + return TypeAdapter(ScaleGBMRegressor).validate_python(predictor_as_dict) + case getml_predictor.XGBoostClassifier(): + return TypeAdapter(XGBoostClassifier).validate_python(predictor_as_dict) + case getml_predictor.XGBoostRegressor(): + return TypeAdapter(XGBoostRegressor).validate_python(predictor_as_dict) + + +def serialize_preprocessor( + preprocessor: getml_preprocessor.CategoryTrimmer + | getml_preprocessor.EmailDomain + | getml_preprocessor.Imputation + | getml_preprocessor.Mapping + | getml_preprocessor.Seasonal + | getml_preprocessor.Substring + | getml_preprocessor.TextFieldSplitter, +) -> Preprocessor: + """Serialize a getML Preprocessor into a Preprocessor object. + + Args: + preprocessor: The Preprocessor to serialize. + + Returns: + Preprocessor: The serialized Preprocessor information. + + """ + preprocessor_as_dict = dataclasses.asdict(preprocessor) + match preprocessor: + case getml_preprocessor.CategoryTrimmer(): + return TypeAdapter(CategoryTrimmer).validate_python(preprocessor_as_dict) + case getml_preprocessor.EmailDomain(): + return TypeAdapter(EmailDomain).validate_python(preprocessor_as_dict) + case getml_preprocessor.Imputation(): + return TypeAdapter(Imputation).validate_python(preprocessor_as_dict) + case getml_preprocessor.Mapping(): + return TypeAdapter(Mapping).validate_python(preprocessor_as_dict) + case getml_preprocessor.Seasonal(): + return TypeAdapter(Seasonal).validate_python(preprocessor_as_dict) + case getml_preprocessor.Substring(): + return TypeAdapter(Substring).validate_python(preprocessor_as_dict) + case getml_preprocessor.TextFieldSplitter(): + return TypeAdapter(TextFieldSplitter).validate_python(preprocessor_as_dict) diff --git a/src/getml_io/serialize/placeholder.py b/src/getml_io/serialize/placeholder.py new file mode 100644 index 0000000..456f037 --- /dev/null +++ b/src/getml_io/serialize/placeholder.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from getml.data import Placeholder + +from getml_io.getml.relationships import Relationship +from getml_io.metadata.placeholder_information import ( + JoinInformation, + PlaceholderInformation, +) +from getml_io.serialize.roles import serialize_roles + + +def serialize_placeholder(placeholder: Placeholder) -> PlaceholderInformation: + """Serialize a getML Placeholder into a PlaceholderInformation object. + + Args: + placeholder: The Placeholder to serialize. + + Returns: + PlaceholderInformation: The serialized Placeholder information. + + """ + return PlaceholderInformation( + name=placeholder.name, + roles=serialize_roles(placeholder.roles), + joins=[ + JoinInformation( + right=serialize_placeholder(join.right), + on=join.on, + time_stamps=join.time_stamps, + upper_time_stamp=join.upper_time_stamp, + relationship=Relationship(join.relationship), + memory=join.memory, + horizon=join.horizon, + lagged_targets=join.lagged_targets, + ) + for join in placeholder.joins + ], + parent=placeholder.parent.name if placeholder.parent else None, + ) diff --git a/src/getml_io/serialize/roles.py b/src/getml_io/serialize/roles.py new file mode 100644 index 0000000..2fc160c --- /dev/null +++ b/src/getml_io/serialize/roles.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from getml.data import Roles as GetMLRoles +from getml.data.roles.types import Role as GetMLRole + +from getml_io.getml.roles import Role, Roles +from getml_io.utils.convert import assume_is_str + + +def serialize_roles(roles: GetMLRoles) -> Roles: + """Serialize the roles of getML into a Roles object. + + Args: + roles: The Roles object to serialize. + + Returns: + Roles: The serialized roles information. + + """ + return Roles( + categorical=list(roles.categorical), + join_key=list(roles.join_key), + numerical=list(roles.numerical), + target=list(roles.target), + text=list(roles.text), + time_stamp=list(roles.time_stamp), + unused_float=list(roles.unused_float), + unused_string=list(roles.unused_string), + ) + + +def serialize_role(role: GetMLRole) -> Role: + """Serialize a getML Role into a Role object. + + Args: + role: The Role to serialize. + + Returns: + Role: The serialized role information. + + """ + return Role(assume_is_str(role)) diff --git a/tests/integration/assertions.py b/tests/integration/assertions.py index b07dea6..422d10d 100644 --- a/tests/integration/assertions.py +++ b/tests/integration/assertions.py @@ -1,3 +1,4 @@ +from collections.abc import Sequence from pathlib import Path from getml_io.metadata.container_information import ContainerInformation @@ -12,8 +13,8 @@ def assert_container_parquets( container_directory: Path, - expected_peripheral_parquets: list[str], - expected_subset_parquets: list[str], + expected_peripheral_parquets: Sequence[str], + expected_subset_parquets: Sequence[str], ) -> None: assert_files_in_directory( container_directory / "peripheral", @@ -28,8 +29,8 @@ def assert_container_parquets( def assert_pipeline_parquets( pipeline_directory: Path, - expected_feature_sets: list[str], - expected_predictions: list[str], + expected_feature_sets: Sequence[str], + expected_predictions: Sequence[str], ) -> None: assert_files_in_directory( pipeline_directory / "feature_sets", @@ -44,7 +45,7 @@ def assert_pipeline_parquets( def assert_files_in_directory( directory: Path, - expected_files: list[str], + expected_files: Sequence[str], ) -> None: assert directory.exists() assert directory.is_dir() @@ -103,12 +104,15 @@ def assert_dataframe_information( assert dataframe_information.path == expected_dataframe_information.path assert ( - dataframe_information.profile.keys() - == expected_dataframe_information.profile.keys() + dataframe_information.column_profile.keys() + == expected_dataframe_information.column_profile.keys() ) - for column_name, column_profile in expected_dataframe_information.profile.items(): + for ( + column_name, + column_profile, + ) in expected_dataframe_information.column_profile.items(): assert_column_profile( - dataframe_information.profile[column_name], + dataframe_information.column_profile[column_name], column_profile, ) @@ -169,6 +173,45 @@ def assert_pipeline_information( prediction, ) + assert ( + pipeline_information.feature_learners + == expected_pipeline_information.feature_learners + ) + assert ( + pipeline_information.feature_selectors + == expected_pipeline_information.feature_selectors + ) + assert ( + pipeline_information.include_categorical + == expected_pipeline_information.include_categorical + ) + + assert ( + pipeline_information.is_classification + == expected_pipeline_information.is_classification + ) + assert ( + pipeline_information.is_regression + == expected_pipeline_information.is_regression + ) + assert ( + pipeline_information.loss_function + == expected_pipeline_information.loss_function + ) + assert pipeline_information.peripheral == expected_pipeline_information.peripheral + assert pipeline_information.predictors == expected_pipeline_information.predictors + assert ( + pipeline_information.preprocessors + == expected_pipeline_information.preprocessors + ) + assert ( + pipeline_information.share_selected_features + == expected_pipeline_information.share_selected_features + ) + assert pipeline_information.tags + assert pipeline_information.targets == expected_pipeline_information.targets + assert pipeline_information.data_model == expected_pipeline_information.data_model + def assert_table_information( table_information: TableInformation, diff --git a/tests/integration/data/datasets.py b/tests/integration/data/datasets.py index 5e3852b..df3a3bf 100644 --- a/tests/integration/data/datasets.py +++ b/tests/integration/data/datasets.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections.abc import Mapping, Sequence from enum import Enum from pathlib import Path @@ -18,11 +19,11 @@ class DataSetName(str, Enum): @dataclass class DataSet: name: DataSetName - population: list[Path] - peripheral: list[Path] + population: Sequence[Path] + peripheral: Sequence[Path] -DATASETS: dict[DataSetName, DataSet] = { +DATASETS: Mapping[DataSetName, DataSet] = { DataSetName.ROBOT: DataSet( name=DataSetName.ROBOT, population=[ diff --git a/tests/integration/data/getmlproject.py b/tests/integration/data/getmlproject.py index 8958cf8..7d66ed6 100644 --- a/tests/integration/data/getmlproject.py +++ b/tests/integration/data/getmlproject.py @@ -2,8 +2,9 @@ import logging from abc import ABC, abstractmethod -from collections.abc import Callable, Generator +from collections.abc import Callable, Generator, Sequence from contextlib import contextmanager +from itertools import chain from pathlib import Path from typing import TypeVar @@ -88,7 +89,7 @@ def _is_bundle_older_than_dataset(self) -> bool: bundle_mtime = self._path.stat().st_mtime - for path in population_paths + peripheral_paths: + for path in chain(population_paths, peripheral_paths): if path.exists() and path.stat().st_mtime > bundle_mtime: return True @@ -130,7 +131,7 @@ def create( def _load_dataframes_from_parquet( self, - dataset_paths: list[Path], + dataset_paths: Sequence[Path], ) -> dict[str, DataFrame | View]: logger.info("Loading dataframes from parquet files %r...", dataset_paths) dataframes: dict[str, DataFrame | View] = {} diff --git a/tests/integration/data/loans/expected.container.json b/tests/integration/data/loans/expected.container.json index a312459..476d87c 100644 --- a/tests/integration/data/loans/expected.container.json +++ b/tests/integration/data/loans/expected.container.json @@ -5,7 +5,7 @@ "meta": { "name": "meta", "path": "container/peripheral/meta.parquet", - "profile": { + "column_profile": { "account_id": { "name": "account_id", "role": "join_key", @@ -421,7 +421,7 @@ "order": { "name": "order", "path": "container/peripheral/order.parquet", - "profile": { + "column_profile": { "account_id": { "name": "account_id", "role": "join_key", @@ -520,7 +520,7 @@ "trans": { "name": "trans", "path": "container/peripheral/trans.parquet", - "profile": { + "column_profile": { "date": { "name": "date", "role": "time_stamp", @@ -683,7 +683,7 @@ "train": { "name": "train", "path": "container/subsets/train.parquet", - "profile": { + "column_profile": { "date_loan": { "name": "date_loan", "role": "time_stamp", @@ -867,7 +867,7 @@ "test": { "name": "test", "path": "container/subsets/test.parquet", - "profile": { + "column_profile": { "date_loan": { "name": "date_loan", "role": "time_stamp", @@ -1050,4 +1050,4 @@ } }, "deep_copy": false -} \ No newline at end of file +} diff --git a/tests/integration/data/loans/expected.pipeline.json b/tests/integration/data/loans/expected.pipeline.json index fc34029..e224a2b 100644 --- a/tests/integration/data/loans/expected.pipeline.json +++ b/tests/integration/data/loans/expected.pipeline.json @@ -1,5 +1,5 @@ { - "id": "3a4bkF", + "id": "n4ARIs", "predictions": { "train": { "name": "train", @@ -14,7 +14,7 @@ "train": { "name": "features.train", "path": "pipeline/feature_sets/features.train.parquet", - "profile": { + "column_profile": { "date_loan": { "name": "date_loan", "role": "time_stamp", @@ -27,7 +27,6 @@ "q25": "1995-05-25T08:40:00", "q50": "1997-01-05T09:18:22.040816", "q75": "1997-12-03T08:40:00", - "std": null, "null_percentage": 0.0, "column_type": "TIMESTAMP_NS", "type": "time_stamp" @@ -69,14 +68,14 @@ "role": "numerical", "statistics": { "count": 459, - "approx_unique": 42, - "avg": 760.5141612200435, + "approx_unique": 54, + "avg": 1963.1830065359477, "min": 0.0, - "max": 19621.0, - "q25": 0.0, - "q50": 0.0, - "q75": 202.7777777777778, - "std": 2501.0428875679136, + "max": 53991.0, + "q25": 145.83333333333334, + "q50": 569.3877551020407, + "q75": 900.0, + "std": 6658.363509588833, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -87,14 +86,14 @@ "role": "numerical", "statistics": { "count": 459, - "approx_unique": 17, - "avg": 882447.0588235294, + "approx_unique": 287, + "avg": 26940800.0, "min": 0.0, - "max": 68601600.0, - "q25": 0.0, - "q50": 0.0, - "q75": 0.0, - "std": 6212366.872861004, + "max": 42076800.0, + "q25": 16213200.0, + "q50": 30620865.30612245, + "q75": 39661200.0, + "std": 13345080.744145872, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -141,14 +140,14 @@ "role": "numerical", "statistics": { "count": 459, - "approx_unique": 34, - "avg": 7.1725490196078425, + "approx_unique": 128, + "avg": 72.32483660130725, "min": 0.0, - "max": 100.0, - "q25": 0.0, - "q50": 0.0, - "q75": 0.0, - "std": 22.558438138781643, + "max": 200.0, + "q25": 50.90277777777777, + "q50": 63.077551020408166, + "q75": 100.0, + "std": 43.688883621604056, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -231,14 +230,14 @@ "role": "numerical", "statistics": { "count": 459, - "approx_unique": 23, - "avg": 0.1993464052287582, + "approx_unique": 22, + "avg": 0.19324618736383445, "min": 0.0, "max": 6.6, "q25": 0.0, "q50": 0.0, "q75": 0.0, - "std": 0.7869983380593814, + "std": 0.7828247239515089, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -303,7 +302,7 @@ "test": { "name": "features.test", "path": "pipeline/feature_sets/features.test.parquet", - "profile": { + "column_profile": { "date_loan": { "name": "date_loan", "role": "time_stamp", @@ -316,7 +315,6 @@ "q25": "1995-11-23T07:20:00", "q50": "1997-04-28T16:00:00", "q75": "1998-01-15T20:00:00", - "std": null, "null_percentage": 0.0, "column_type": "TIMESTAMP_NS", "type": "time_stamp" @@ -358,14 +356,14 @@ "role": "numerical", "statistics": { "count": 223, - "approx_unique": 28, - "avg": 791.0224215246636, + "approx_unique": 30, + "avg": 1794.914798206278, "min": 0.0, - "max": 18200.0, - "q25": 0.0, - "q50": 0.0, - "q75": 120.41666666666667, - "std": 2808.7462985761595, + "max": 67376.0, + "q25": 200.0, + "q50": 466.6666666666667, + "q75": 900.0, + "std": 6927.7664625313855, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -376,14 +374,14 @@ "role": "numerical", "statistics": { "count": 223, - "approx_unique": 11, - "avg": 1550550.67264574, + "approx_unique": 147, + "avg": 27327971.30044843, "min": 0.0, - "max": 128390400.0, - "q25": 0.0, - "q50": 0.0, - "q75": 0.0, - "std": 10521062.635618178, + "max": 43804800.0, + "q25": 17558400.0, + "q50": 29433600.0, + "q75": 39708000.0, + "std": 12861334.000076162, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -430,14 +428,14 @@ "role": "numerical", "statistics": { "count": 223, - "approx_unique": 13, - "avg": 3.6977578475336323, + "approx_unique": 92, + "avg": 77.92331838565016, "min": 0.0, - "max": 100.0, - "q25": 0.0, - "q50": 0.0, - "q75": 0.0, - "std": 15.026950095889092, + "max": 200.0, + "q25": 52.963888888888896, + "q50": 70.53333333333333, + "q75": 100.0, + "std": 40.10804320182245, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -502,14 +500,14 @@ "role": "numerical", "statistics": { "count": 223, - "approx_unique": 7, - "avg": 0.2825112107623318, + "approx_unique": 6, + "avg": 0.2531390134529148, "min": 0.0, "max": 7.01, "q25": 0.0, "q50": 0.0, "q75": 0.0, - "std": 1.114162299388315, + "std": 1.0314587821754926, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -520,14 +518,14 @@ "role": "numerical", "statistics": { "count": 223, - "approx_unique": 17, - "avg": 0.20986547085201795, + "approx_unique": 16, + "avg": 0.19192825112107625, "min": 0.0, "max": 6.6, "q25": 0.0, "q50": 0.0, "q75": 0.0, - "std": 0.8666449130086721, + "std": 0.8325188926028172, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -589,5 +587,334 @@ } } } + }, + "feature_learners": [ + { + "aggregation": [ + "MIN", + "SUM", + "COUNT", + "MAX", + "AVG" + ], + "allow_sets": true, + "delta_t": 0.0, + "grid_factor": 1.0, + "loss_function": "CrossEntropyLoss", + "max_length": 4, + "min_df": 30, + "min_num_samples": 1, + "num_features": 10, + "num_subfeatures": 5, + "num_threads": 0, + "propositionalization": { + "aggregation": [ + "MIN", + "AVG", + "MODE", + "COUNT MINUS COUNT DISTINCT", + "MEDIAN", + "SUM", + "STDDEV", + "LAST", + "COUNT", + "MAX", + "FIRST", + "COUNT DISTINCT", + "TREND" + ], + "delta_t": 0.0, + "loss_function": "CrossEntropyLoss", + "max_lag": 0, + "min_df": 30, + "n_most_frequent": 0, + "num_features": 200, + "num_threads": 0, + "sampling_factor": 1.0, + "silent": true, + "vocab_size": 500, + "type": "fast_prop" + }, + "regularization": 0.01, + "round_robin": false, + "sampling_factor": 1.0, + "seed": 5543, + "share_aggregations": 0.0, + "share_conditions": 1.0, + "shrinkage": 0.0, + "silent": true, + "vocab_size": 500, + "type": "multirel" + } + ], + "feature_selectors": [], + "include_categorical": false, + "is_classification": true, + "is_regression": false, + "loss_function": "CrossEntropyLoss", + "peripheral": [ + { + "name": "meta", + "roles": { + "categorical": [], + "join_key": [], + "numerical": [], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [], + "unused_string": [] + }, + "joins": [], + "parent": null + }, + { + "name": "order", + "roles": { + "categorical": [], + "join_key": [], + "numerical": [], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [], + "unused_string": [] + }, + "joins": [], + "parent": null + }, + { + "name": "trans", + "roles": { + "categorical": [], + "join_key": [], + "numerical": [], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [], + "unused_string": [] + }, + "joins": [], + "parent": null + } + ], + "predictors": [ + { + "booster": "gbtree", + "colsample_bylevel": 1.0, + "colsample_bytree": 1.0, + "early_stopping_rounds": 10, + "gamma": 0.0, + "learning_rate": 0.1, + "max_delta_step": 0.0, + "max_depth": 3, + "min_child_weights": 1.0, + "n_estimators": 100, + "external_memory": false, + "normalize_type": "tree", + "num_parallel_tree": 1, + "n_jobs": 1, + "objective": "binary:logistic", + "one_drop": false, + "rate_drop": 0.0, + "reg_alpha": 0.0, + "reg_lambda": 1.0, + "sample_type": "uniform", + "silent": true, + "skip_drop": 0.0, + "subsample": 1.0, + "type": "xgboost_classifier" + } + ], + "preprocessors": [], + "share_selected_features": 0.5, + "tags": [ + "container-loans" + ], + "targets": [ + "default" + ], + "data_model": { + "population": { + "name": "population", + "roles": { + "categorical": [ + "frequency" + ], + "join_key": [ + "account_id" + ], + "numerical": [ + "duration", + "payments", + "amount" + ], + "target": [ + "default" + ], + "text": [], + "time_stamp": [ + "date_loan" + ], + "unused_float": [ + "loan_id", + "district_id" + ], + "unused_string": [ + "date_account", + "status" + ] + }, + "joins": [ + { + "right": { + "name": "trans", + "roles": { + "categorical": [ + "type", + "k_symbol", + "bank", + "operation" + ], + "join_key": [ + "account_id" + ], + "numerical": [ + "amount", + "balance" + ], + "target": [], + "text": [], + "time_stamp": [ + "date" + ], + "unused_float": [ + "trans_id", + "account" + ], + "unused_string": [] + }, + "joins": [], + "parent": "population" + }, + "on": [ + [ + "account_id", + "account_id" + ] + ], + "time_stamps": [ + "date_loan", + "date" + ], + "upper_time_stamp": "", + "relationship": "many-to-many", + "memory": 0.0, + "horizon": 0.0, + "lagged_targets": false + }, + { + "right": { + "name": "order", + "roles": { + "categorical": [ + "bank_to", + "k_symbol" + ], + "join_key": [ + "account_id" + ], + "numerical": [ + "amount" + ], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [ + "account_to", + "order_id" + ], + "unused_string": [] + }, + "joins": [], + "parent": "population" + }, + "on": [ + [ + "account_id", + "account_id" + ] + ], + "time_stamps": null, + "upper_time_stamp": "", + "relationship": "many-to-many", + "memory": 0.0, + "horizon": 0.0, + "lagged_targets": false + }, + { + "right": { + "name": "meta", + "roles": { + "categorical": [ + "type_disp", + "type_card", + "gender", + "A3" + ], + "join_key": [ + "account_id" + ], + "numerical": [ + "A4", + "A5", + "A6", + "A7", + "A8", + "A9", + "A10", + "A11", + "A12", + "A13", + "A14", + "A15", + "A16" + ], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [ + "disp_id", + "client_id", + "card_id", + "district_id" + ], + "unused_string": [ + "issued", + "birth_date", + "A2" + ] + }, + "joins": [], + "parent": "population" + }, + "on": [ + [ + "account_id", + "account_id" + ] + ], + "time_stamps": null, + "upper_time_stamp": "", + "relationship": "many-to-many", + "memory": 0.0, + "horizon": 0.0, + "lagged_targets": false + } + ], + "parent": null + }, + "peripheral": {} } -} \ No newline at end of file +} diff --git a/tests/integration/data/numerical/expected.container.json b/tests/integration/data/numerical/expected.container.json index 970cb2a..1b4d6a6 100644 --- a/tests/integration/data/numerical/expected.container.json +++ b/tests/integration/data/numerical/expected.container.json @@ -5,7 +5,7 @@ "perph": { "name": "perph", "path": "container/peripheral/perph.parquet", - "profile": { + "column_profile": { "time_stamp": { "name": "time_stamp", "role": "time_stamp", @@ -62,7 +62,7 @@ "train": { "name": "train", "path": "container/subsets/train.parquet", - "profile": { + "column_profile": { "time_stamp": { "name": "time_stamp", "role": "time_stamp", @@ -135,7 +135,7 @@ "test": { "name": "test", "path": "container/subsets/test.parquet", - "profile": { + "column_profile": { "time_stamp": { "name": "time_stamp", "role": "time_stamp", @@ -207,4 +207,4 @@ } }, "deep_copy": false -} \ No newline at end of file +} diff --git a/tests/integration/data/numerical/expected.pipeline.json b/tests/integration/data/numerical/expected.pipeline.json index 0379765..469e1a1 100644 --- a/tests/integration/data/numerical/expected.pipeline.json +++ b/tests/integration/data/numerical/expected.pipeline.json @@ -1,5 +1,5 @@ { - "id": "dzARR1", + "id": "RbdvzM", "predictions": { "train": { "name": "train", @@ -14,7 +14,7 @@ "train": { "name": "features.train", "path": "pipeline/feature_sets/features.train.parquet", - "profile": { + "column_profile": { "time_stamp": { "name": "time_stamp", "role": "time_stamp", @@ -27,7 +27,6 @@ "q25": "1970-01-01T00:00:00.271797", "q50": "1970-01-01T00:00:00.525929", "q75": "1970-01-01T00:00:00.755042", - "std": null, "null_percentage": 0.0, "column_type": "TIMESTAMP_NS", "type": "time_stamp" @@ -69,14 +68,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 409, - "avg": 96.16907668121702, + "approx_unique": 325, + "avg": 96.16907668123311, "min": 0.0, - "max": 154.958538372792, - "q25": 63.7858157217645, - "q50": 113.27765320314404, - "q75": 126.50463010411374, - "std": 40.831795142709815, + "max": 154.95853837280958, + "q25": 63.78581572178133, + "q50": 113.27765320315866, + "q75": 126.50463010412777, + "std": 40.83179514271343, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -87,14 +86,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 375, - "avg": 0.8376766309834969, - "min": -0.005741885599707359, - "max": 2.7194084335179305, - "q25": 0.652078199793918, - "q50": 0.9700795623321786, - "q75": 1.0064016977526749, - "std": 0.25570143177465654, + "approx_unique": 460, + "avg": 0.8376766309822631, + "min": -0.005741885778443031, + "max": 2.719408433645138, + "q25": 0.6520781997438738, + "q50": 0.9700795623721068, + "q75": 1.0064016977732944, + "std": 0.2557014318224649, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -105,14 +104,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 385, - "avg": 86.38617317829302, + "approx_unique": 539, + "avg": 86.38617317829309, "min": 0.0, - "max": 139.48311171572644, - "q25": 57.23467415456807, - "q50": 101.87912254707506, - "q75": 113.60643119883323, - "std": 36.712671335094534, + "max": 139.4831117157228, + "q25": 57.23467415456723, + "q50": 101.87912254707464, + "q75": 113.6064311988338, + "std": 36.712671335094896, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -123,14 +122,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 410, - "avg": 1.486671539444774, + "approx_unique": 348, + "avg": 1.4866715394762007, "min": 0.0, - "max": 18.07352556914617, - "q25": 0.5113719486389163, - "q50": 0.6982874132646059, - "q75": 1.482302195795544, - "std": 2.4958715677627152, + "max": 18.07352556921219, + "q25": 0.5113719486689973, + "q50": 0.6982874132797834, + "q75": 1.482302195832012, + "std": 2.49587156776478, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -141,14 +140,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 316, - "avg": 78.06387842451362, + "approx_unique": 302, + "avg": 78.0638784245176, "min": 0.0, - "max": 125.92644437182132, - "q25": 51.625347761997176, - "q50": 92.20745748713266, - "q75": 102.69796763001868, - "std": 33.211534248078024, + "max": 125.9264443718258, + "q25": 51.62534776200198, + "q50": 92.20745748713675, + "q75": 102.6979676300216, + "std": 33.2115342480785, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -159,14 +158,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 448, - "avg": 0.5609713545812214, - "min": -1.8843295948745304, - "max": 7.3863370740225225, - "q25": 0.3106364637147931, - "q50": 0.4926810309502056, - "q75": 0.589746691497219, - "std": 1.0748256879417633, + "approx_unique": 326, + "avg": 0.5609713545814653, + "min": -1.884329594891604, + "max": 7.386337074028212, + "q25": 0.31063646371654113, + "q50": 0.4926810309517353, + "q75": 0.5897466914989792, + "std": 1.074825687942176, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -177,14 +176,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 401, - "avg": 70.10412027365548, + "approx_unique": 468, + "avg": 70.10412027364201, "min": 0.0, - "max": 113.10321385166299, - "q25": 46.53178780372958, - "q50": 82.55886648685942, - "q75": 92.25996193856125, - "std": 29.748700279909087, + "max": 113.10321385165373, + "q25": 46.531787803716234, + "q50": 82.55886648683865, + "q75": 92.25996193853011, + "std": 29.748700279905826, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -195,14 +194,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 430, - "avg": 0.514033325846855, - "min": -0.4440243803830779, - "max": 0.949369395172385, - "q25": 0.40480853219332824, - "q50": 0.6491476769330858, - "q75": 0.6766508980468096, - "std": 0.2595060637322195, + "approx_unique": 375, + "avg": 0.5140333258914743, + "min": -0.44402437982294773, + "max": 0.9493693949050362, + "q25": 0.4048085322650249, + "q50": 0.6491476769270209, + "q75": 0.6766508980437053, + "std": 0.2595060636392321, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -213,14 +212,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 384, - "avg": 62.90321872321439, + "approx_unique": 378, + "avg": 62.90616871451977, "min": 0.0, - "max": 101.788803667452, - "q25": 41.77890864643118, - "q50": 74.08221392647076, - "q75": 82.57089164834606, - "std": 26.689815669012102, + "max": 101.78880366744104, + "q25": 41.77890864641669, + "q50": 74.08221392645343, + "q75": 82.5708916483322, + "std": 26.691806127961343, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -231,14 +230,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 361, - "avg": 0.4964582603881268, - "min": -0.453809371715282, - "max": 8.22465349938976, - "q25": -0.013087906353886864, - "q50": 0.16252383065828194, - "q75": 0.32139660116655816, - "std": 1.3656064778547599, + "approx_unique": 403, + "avg": 0.49645826037201785, + "min": -0.45380937173777053, + "max": 8.224653499349321, + "q25": -0.013087906368716035, + "q50": 0.16252383064546452, + "q75": 0.3213966011515323, + "std": 1.3656064778531225, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -267,7 +266,7 @@ "test": { "name": "features.test", "path": "pipeline/feature_sets/features.test.parquet", - "profile": { + "column_profile": { "time_stamp": { "name": "time_stamp", "role": "time_stamp", @@ -280,7 +279,6 @@ "q25": "1970-01-01T00:00:00.265168", "q50": "1970-01-01T00:00:00.526890", "q75": "1970-01-01T00:00:00.760226", - "std": null, "null_percentage": 0.0, "column_type": "TIMESTAMP_NS", "type": "time_stamp" @@ -322,14 +320,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 111, - "avg": 93.45267098578518, - "min": 1.0004581336993716, - "max": 150.31561443814144, - "q25": 61.9960947191257, - "q50": 110.66337045643346, - "q75": 124.18593897750968, - "std": 39.90174967625009, + "approx_unique": 116, + "avg": 93.45267098580021, + "min": 1.0004581336996778, + "max": 150.31561443815576, + "q25": 61.996094719141645, + "q50": 110.66337045645069, + "q75": 124.18593897749646, + "std": 39.90174967625321, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -340,14 +338,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 108, - "avg": 0.8371683464686371, - "min": 0.2616188362295636, - "max": 1.3008994591492, - "q25": 0.6463480308446998, - "q50": 0.976979499386938, - "q75": 0.9981102813038182, - "std": 0.22694451760592796, + "approx_unique": 115, + "avg": 0.8371683464674687, + "min": 0.26161883596933283, + "max": 1.3008994594133347, + "q25": 0.6463480307577066, + "q50": 0.9769794994204424, + "q75": 0.9981102813307521, + "std": 0.2269445176703966, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -358,14 +356,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 140, - "avg": 83.92568798146601, - "min": 0.8936319178549653, - "max": 135.34260543221865, - "q25": 55.618658113985575, - "q50": 99.41231630964873, - "q75": 112.1077854754618, - "std": 35.8615898850991, + "approx_unique": 118, + "avg": 83.92568798146631, + "min": 0.8936319178549497, + "max": 135.34260543221754, + "q25": 55.61865811398502, + "q50": 99.41231630964933, + "q75": 112.10778547546019, + "std": 35.861589885099654, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -376,14 +374,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 121, - "avg": 1.3729637341834586, - "min": 0.261503923197538, - "max": 14.741123450931518, - "q25": 0.5079183380724014, - "q50": 0.7330302592870825, - "q75": 1.5598762921374063, - "std": 2.1587383317017386, + "approx_unique": 97, + "avg": 1.3729637342160355, + "min": 0.26150392324676175, + "max": 14.741123450975595, + "q25": 0.5079183381010651, + "q50": 0.733030259298452, + "q75": 1.5598762921754463, + "std": 2.1587383317031876, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -394,14 +392,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 120, - "avg": 75.8412119711527, - "min": 0.8078674007375028, - "max": 122.03529180787704, - "q25": 50.168825830397346, - "q50": 90.3025019699042, - "q75": 101.01097835627549, - "std": 32.4378848933766, + "approx_unique": 88, + "avg": 75.84121197115688, + "min": 0.8078674007376023, + "max": 122.03529180787784, + "q25": 50.16882583040196, + "q50": 90.30250196991506, + "q75": 101.0109783562762, + "std": 32.43788489337715, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -412,14 +410,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 101, - "avg": 0.5049229013397559, - "min": -1.0234322123164012, - "max": 6.118363465758665, - "q25": 0.1428351156399801, - "q50": 0.4803652747349002, - "q75": 0.5681416543619112, - "std": 1.0283977404632196, + "approx_unique": 106, + "avg": 0.5049229013398149, + "min": -1.0234322123252753, + "max": 6.118363465755303, + "q25": 0.14283511563729268, + "q50": 0.4803652747350423, + "q75": 0.5681416543623228, + "std": 1.0283977404635392, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -430,14 +428,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 105, - "avg": 68.1177127554369, - "min": 0.7276733948503948, - "max": 109.66181307295999, - "q25": 45.19785585450378, - "q50": 80.4634801504877, - "q75": 90.96212075432149, - "std": 29.061616382307164, + "approx_unique": 112, + "avg": 68.11771275542432, + "min": 0.7276733948500856, + "max": 109.66181307290546, + "q25": 45.197855854491515, + "q50": 80.4634801504667, + "q75": 90.96212075430984, + "std": 29.061616382304035, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -448,14 +446,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 101, - "avg": 0.4979052918487878, - "min": -0.7685655011257438, - "max": 0.8965485722446492, - "q25": 0.31406675057102806, - "q50": 0.6465958189908928, - "q75": 0.6755108754778928, - "std": 0.27431090438821965, + "approx_unique": 109, + "avg": 0.4979052918970571, + "min": -0.7685655004747685, + "max": 0.8965485721217097, + "q25": 0.31406675065235196, + "q50": 0.6465958189771321, + "q75": 0.675510875478117, + "std": 0.27431090429004085, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -466,14 +464,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 125, - "avg": 61.12278600297129, - "min": 0.6534488975862575, - "max": 98.15463652421363, - "q25": 40.591311515103115, - "q50": 71.44444119850576, - "q75": 81.54953124372442, - "std": 26.07679480798724, + "approx_unique": 94, + "avg": 61.12278600295739, + "min": 0.6534488975859385, + "max": 98.15463652420176, + "q25": 40.59131151508852, + "q50": 71.44444119849118, + "q75": 81.54953124371184, + "std": 26.07679480798417, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -484,14 +482,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 113, - "avg": 0.3782221036599077, - "min": -0.9960609366948441, - "max": 6.97305844251124, - "q25": -0.02509066172728069, - "q50": 0.16454383872023082, - "q75": 0.30357637470021315, - "std": 1.150968935924188, + "approx_unique": 96, + "avg": 0.37822210364341335, + "min": -0.9960609367144815, + "max": 6.973058442481465, + "q25": -0.02509066174406189, + "q50": 0.16454383870471806, + "q75": 0.30357637468359877, + "std": 1.1509689359232387, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -517,5 +515,177 @@ } } } + }, + "feature_learners": [ + { + "allow_avg": true, + "delta_t": 0.0, + "gamma": 0.0, + "loss_function": "SquareLoss", + "max_depth": 2, + "min_df": 30, + "min_num_samples": 1, + "num_features": 10, + "num_subfeatures": 30, + "num_threads": 0, + "propositionalization": { + "aggregation": [ + "COUNT DISTINCT", + "STDDEV", + "COUNT", + "FIRST", + "COUNT MINUS COUNT DISTINCT", + "MEDIAN", + "AVG", + "MAX", + "LAST", + "MIN", + "SUM", + "MODE", + "TREND" + ], + "delta_t": 0.0, + "loss_function": "SquareLoss", + "max_lag": 0, + "min_df": 30, + "n_most_frequent": 0, + "num_features": 200, + "num_threads": 0, + "sampling_factor": 1.0, + "silent": true, + "vocab_size": 500, + "type": "fast_prop" + }, + "reg_lambda": 0.0, + "sampling_factor": 1.0, + "seed": 5543, + "shrinkage": 0.1, + "silent": true, + "vocab_size": 500, + "type": "rel_mt" + } + ], + "feature_selectors": [], + "include_categorical": false, + "is_classification": false, + "is_regression": true, + "loss_function": "SquareLoss", + "peripheral": [ + { + "name": "perph", + "roles": { + "categorical": [], + "join_key": [], + "numerical": [], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [], + "unused_string": [] + }, + "joins": [], + "parent": null + } + ], + "predictors": [ + { + "booster": "gbtree", + "colsample_bylevel": 1.0, + "colsample_bytree": 1.0, + "early_stopping_rounds": 10, + "external_memory": false, + "gamma": 0.0, + "learning_rate": 0.1, + "max_delta_step": 0.0, + "max_depth": 3, + "min_child_weights": 1.0, + "n_estimators": 100, + "normalize_type": "tree", + "num_parallel_tree": 1, + "n_jobs": 1, + "objective": "reg:squarederror", + "one_drop": false, + "rate_drop": 0.0, + "reg_alpha": 0.0, + "reg_lambda": 1.0, + "sample_type": "uniform", + "silent": true, + "skip_drop": 0.0, + "subsample": 1.0, + "type": "xgboost_regressor" + } + ], + "preprocessors": [], + "share_selected_features": 0.5, + "tags": [ + "container-numerical" + ], + "targets": [ + "targets" + ], + "data_model": { + "population": { + "name": "population", + "roles": { + "categorical": [], + "join_key": [ + "join_key" + ], + "numerical": [ + "column_01" + ], + "target": [ + "targets" + ], + "text": [], + "time_stamp": [ + "time_stamp" + ], + "unused_float": [], + "unused_string": [] + }, + "joins": [ + { + "right": { + "name": "perph", + "roles": { + "categorical": [], + "join_key": [ + "join_key" + ], + "numerical": [ + "column_01" + ], + "target": [], + "text": [], + "time_stamp": [ + "time_stamp" + ], + "unused_float": [], + "unused_string": [] + }, + "joins": [], + "parent": "population" + }, + "on": [ + [ + "join_key", + "join_key" + ] + ], + "time_stamps": [ + "time_stamp", + "time_stamp" + ], + "upper_time_stamp": "", + "relationship": "many-to-many", + "memory": 0.0, + "horizon": 0.0, + "lagged_targets": false + } + ], + "parent": null + }, + "peripheral": {} } -} \ No newline at end of file +} diff --git a/tests/integration/data/robot/expected.container.json b/tests/integration/data/robot/expected.container.json index 3b9a1c0..dadde8b 100644 --- a/tests/integration/data/robot/expected.container.json +++ b/tests/integration/data/robot/expected.container.json @@ -5,7 +5,7 @@ "full": { "name": "full", "path": "container/peripheral/full.parquet", - "profile": { + "column_profile": { "rowid": { "name": "rowid", "role": "time_stamp", @@ -1759,7 +1759,7 @@ "train": { "name": "full", "path": "container/subsets/train.full.parquet", - "profile": { + "column_profile": { "rowid": { "name": "rowid", "role": "time_stamp", @@ -3511,7 +3511,7 @@ "validation": { "name": "full", "path": "container/subsets/validation.full.parquet", - "profile": { + "column_profile": { "rowid": { "name": "rowid", "role": "time_stamp", @@ -5263,7 +5263,7 @@ "test": { "name": "full", "path": "container/subsets/test.full.parquet", - "profile": { + "column_profile": { "rowid": { "name": "rowid", "role": "time_stamp", @@ -7014,4 +7014,4 @@ } }, "deep_copy": false -} \ No newline at end of file +} diff --git a/tests/integration/data/robot/expected.pipeline.json b/tests/integration/data/robot/expected.pipeline.json index 6137b76..df8eb6e 100644 --- a/tests/integration/data/robot/expected.pipeline.json +++ b/tests/integration/data/robot/expected.pipeline.json @@ -1,5 +1,5 @@ { - "id": "WYtzAK", + "id": "kSE7Uw", "predictions": { "train": { "name": "train", @@ -18,7 +18,7 @@ "train": { "name": "features.train", "path": "pipeline/feature_sets/features.train.parquet", - "profile": { + "column_profile": { "f_x": { "name": "f_x", "role": "target", @@ -2292,7 +2292,7 @@ "validation": { "name": "features.validation", "path": "pipeline/feature_sets/features.validation.parquet", - "profile": { + "column_profile": { "f_x": { "name": "f_x", "role": "target", @@ -4566,7 +4566,7 @@ "test": { "name": "features.test", "path": "pipeline/feature_sets/features.test.parquet", - "profile": { + "column_profile": { "f_x": { "name": "f_x", "role": "target", @@ -6837,5 +6837,365 @@ } } } + }, + "feature_learners": [ + { + "allow_null_weights": false, + "delta_t": 0.0, + "gamma": 0.0, + "loss_function": "SquareLoss", + "max_depth": 3, + "min_df": 30, + "min_num_samples": 1, + "num_features": 10, + "num_subfeatures": 100, + "num_threads": 0, + "propositionalization": { + "aggregation": [ + "MAX", + "SUM", + "AVG", + "STDDEV", + "MIN", + "COUNT MINUS COUNT DISTINCT", + "COUNT", + "LAST", + "COUNT DISTINCT", + "MEDIAN", + "FIRST", + "TREND", + "MODE" + ], + "delta_t": 0.0, + "loss_function": "SquareLoss", + "max_lag": 0, + "min_df": 30, + "n_most_frequent": 0, + "num_features": 200, + "num_threads": 0, + "sampling_factor": 1.0, + "silent": true, + "vocab_size": 500, + "type": "fast_prop" + }, + "reg_lambda": 0.0, + "sampling_factor": 1.0, + "seed": 5543, + "shrinkage": 0.1, + "silent": true, + "vocab_size": 500, + "type": "relboost" + } + ], + "feature_selectors": [], + "include_categorical": false, + "is_classification": false, + "is_regression": true, + "loss_function": "SquareLoss", + "peripheral": [ + { + "name": "full", + "roles": { + "categorical": [], + "join_key": [], + "numerical": [], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [], + "unused_string": [] + }, + "joins": [], + "parent": null + } + ], + "predictors": [ + { + "booster": "gbtree", + "colsample_bylevel": 1.0, + "colsample_bytree": 1.0, + "early_stopping_rounds": 10, + "external_memory": false, + "gamma": 0.0, + "learning_rate": 0.1, + "max_delta_step": 0.0, + "max_depth": 3, + "min_child_weights": 1.0, + "n_estimators": 100, + "normalize_type": "tree", + "num_parallel_tree": 1, + "n_jobs": 1, + "objective": "reg:squarederror", + "one_drop": false, + "rate_drop": 0.0, + "reg_alpha": 0.0, + "reg_lambda": 1.0, + "sample_type": "uniform", + "silent": true, + "skip_drop": 0.0, + "subsample": 1.0, + "type": "xgboost_regressor" + } + ], + "preprocessors": [], + "share_selected_features": 0.5, + "tags": [ + "container-RYJL5w" + ], + "targets": [ + "f_x", + "f_y", + "f_z" + ], + "data_model": { + "population": { + "name": "population", + "roles": { + "categorical": [], + "join_key": [], + "numerical": [ + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "98", + "99", + "100", + "101", + "102", + "103", + "104", + "105", + "106" + ], + "target": [ + "f_x", + "f_y", + "f_z" + ], + "text": [], + "time_stamp": [ + "rowid" + ], + "unused_float": [], + "unused_string": [] + }, + "joins": [ + { + "right": { + "name": "full", + "roles": { + "categorical": [], + "join_key": [], + "numerical": [ + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "98", + "99", + "100", + "101", + "102", + "103", + "104", + "105", + "106" + ], + "target": [ + "f_x", + "f_y", + "f_z" + ], + "text": [], + "time_stamp": [ + "rowid" + ], + "unused_float": [], + "unused_string": [] + }, + "joins": [], + "parent": "population" + }, + "on": [ + [ + null, + null + ] + ], + "time_stamps": [ + "rowid", + "rowid" + ], + "upper_time_stamp": "", + "relationship": "many-to-many", + "memory": 30.0, + "horizon": 0.0, + "lagged_targets": false + } + ], + "parent": null + }, + "peripheral": {} } -} \ No newline at end of file +} diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index ec0d544..ae80cd3 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -1,7 +1,7 @@ import copy import dataclasses import re -from collections.abc import Mapping +from collections.abc import Mapping, Sequence from pathlib import Path from typing import Protocol @@ -11,22 +11,33 @@ import pytest import pytest_mock from duckdb import DuckDBPyConnection -from getml.data import Container, DataFrame, Subset, View +from getml import feature_learning as getml_feature_learner +from getml import predictors as getml_predictor +from getml import preprocessors as getml_preprocessor +from getml.data import Container, DataFrame, Placeholder, Subset, View +from getml.feature_learning.loss_functions import CROSSENTROPYLOSS from getml.pipeline import Pipeline from numpy.typing import NDArray +from getml_io.getml.feature_learning import FastProp +from getml_io.getml.predictors import LinearRegression +from getml_io.getml.preprocessors import CategoryTrimmer from getml_io.getml.project import Project from getml_io.getml.project_information import ProjectInformation +from getml_io.getml.relationships import Relationship +from getml_io.getml.roles import Role, Roles from getml_io.metadata.container_information import ContainerInformation +from getml_io.metadata.data_model_information import DataModelInformation from getml_io.metadata.dataframe_information import ( ColumnProfile, ColumnStatisticsNumerical, DataFrameInformation, - Role, ) -from getml_io.metadata.feature_sets import FeatureSets -from getml_io.metadata.pipeline_information import PipelineInformation -from getml_io.metadata.prediction_results import PredictionResults +from getml_io.metadata.pipeline_information import LossFunction, PipelineInformation +from getml_io.metadata.placeholder_information import ( + JoinInformation, + PlaceholderInformation, +) from getml_io.metadata.table_information import TableInformation from getml_io.utils.convert import assume_is_str @@ -129,7 +140,7 @@ def container_information_empty(tmp_path: Path) -> ContainerInformation: @pytest.fixture -def profile_default() -> ColumnProfile: +def column_profile_default() -> ColumnProfile: return ColumnProfile( name="default", role=Role.NUMERICAL, @@ -152,36 +163,36 @@ def profile_default() -> ColumnProfile: @pytest.fixture def dataframe_information_population( tmp_path: Path, - profile_default: ColumnProfile, + column_profile_default: ColumnProfile, ) -> DataFrameInformation: return DataFrameInformation( name="population", path=tmp_path / "population.parquet", - profile={"default": profile_default}, + column_profile={"default": column_profile_default}, ) @pytest.fixture def dataframe_information_peripheral( tmp_path: Path, - profile_default: ColumnProfile, + column_profile_default: ColumnProfile, ) -> DataFrameInformation: return DataFrameInformation( name="peripheral", path=tmp_path / "peripheral.parquet", - profile={"default": profile_default}, + column_profile={"default": column_profile_default}, ) @pytest.fixture def dataframe_information_subset( tmp_path: Path, - profile_default: ColumnProfile, + column_profile_default: ColumnProfile, ) -> DataFrameInformation: return DataFrameInformation( name="subset", path=tmp_path / "subset.parquet", - profile={"default": profile_default}, + column_profile={"default": column_profile_default}, ) @@ -245,36 +256,36 @@ def mock_dataframe_train(mock_dataframe: DataFrame) -> DataFrame: @pytest.fixture def dataframe_information_train( tmp_path: Path, - profile_default: ColumnProfile, + column_profile_default: ColumnProfile, ) -> DataFrameInformation: return DataFrameInformation( name="dataframe_train", path=tmp_path / "dataframe_train.parquet", - profile={"default": profile_default}, + column_profile={"default": column_profile_default}, ) @pytest.fixture def dataframe_information_test( tmp_path: Path, - profile_default: ColumnProfile, + column_profile_default: ColumnProfile, ) -> DataFrameInformation: return DataFrameInformation( name="dataframe_test", path=tmp_path / "dataframe_test.parquet", - profile={"default": profile_default}, + column_profile={"default": column_profile_default}, ) @pytest.fixture def dataframe_information_validation( tmp_path: Path, - profile_default: ColumnProfile, + column_profile_default: ColumnProfile, ) -> DataFrameInformation: return DataFrameInformation( name="dataframe_validation", path=tmp_path / "dataframe_validation.parquet", - profile={"default": profile_default}, + column_profile={"default": column_profile_default}, ) @@ -295,22 +306,72 @@ def mock_dataframe_validation(mock_dataframe: DataFrame) -> DataFrame: @pytest.fixture def dataframe_information( tmp_path: Path, - profile_default: ColumnProfile, + column_profile_default: ColumnProfile, ) -> DataFrameInformation: return DataFrameInformation( name="dataframe_name", path=tmp_path / "dataframe_name.parquet", - profile={"default": profile_default}, + column_profile={"default": column_profile_default}, + ) + + +@pytest.fixture +def roles_empty() -> Roles: + return Roles( + categorical=[], + join_key=[], + numerical=[], + target=[], + text=[], + time_stamp=[], + unused_float=[], + unused_string=[], + ) + + +@pytest.fixture +def placeholder_information_empty(roles_empty: Roles) -> PlaceholderInformation: + return PlaceholderInformation( + name="placeholder_empty", + roles=roles_empty, + joins=[], + parent=None, + ) + + +@pytest.fixture +def data_model_information_empty( + placeholder_information_empty: PlaceholderInformation, +) -> DataModelInformation: + return DataModelInformation( + population=placeholder_information_empty, + peripheral={}, ) @pytest.fixture -def pipeline_information_empty(tmp_path: Path) -> PipelineInformation: +def pipeline_information_empty( + tmp_path: Path, + data_model_information_empty: DataModelInformation, +) -> PipelineInformation: return PipelineInformation( id="pipeline_empty_id", - predictions=PredictionResults(), - feature_sets=FeatureSets(), + predictions={}, + feature_sets={}, path=tmp_path, + feature_learners=[], + feature_selectors=[], + include_categorical=False, + is_classification=False, + is_regression=True, + loss_function=LossFunction.CROSS_ENTROPY_LOSS, + peripheral=[], + predictors=[], + preprocessors=[], + share_selected_features=0.0, + tags=[], + targets=[], + data_model=data_model_information_empty, ) @@ -338,6 +399,29 @@ def pipeline_transform(_: DataFrame | View | Subset, *, df_name: str) -> DataFra return dataframe pipeline.transform = pipeline_transform + pipeline.loss_function = CROSSENTROPYLOSS + pipeline.peripheral = [Placeholder("placeholder_peripheral")] + pipeline.data_model = mocker.MagicMock() + pipeline.data_model.population = Placeholder("placeholder_population") + pipeline.data_model.peripheral = {} + pipeline.feature_learners = [ + getml_feature_learner.FastProp(), + ] + pipeline.feature_selectors = [ + getml_predictor.LinearRegression(), + ] + pipeline.predictors = [ + getml_predictor.LinearRegression(), + ] + pipeline.preprocessors = [ + getml_preprocessor.CategoryTrimmer(), + ] + pipeline.include_categorical = False + pipeline.is_classification = False + pipeline.is_regression = True + pipeline.share_selected_features = 0.0 + pipeline.tags = ["test_tag"] + pipeline.targets = ["test_target"] return pipeline @@ -384,74 +468,162 @@ def table_information_validation( @pytest.fixture def dataframe_information_features_test( tmp_path: Path, - profile_default: ColumnProfile, + column_profile_default: ColumnProfile, ) -> DataFrameInformation: return DataFrameInformation( name="features.test", path=tmp_path / "features.test.parquet", - profile={"default": profile_default}, + column_profile={"default": column_profile_default}, ) @pytest.fixture def dataframe_information_features_validation( tmp_path: Path, - profile_default: ColumnProfile, + column_profile_default: ColumnProfile, ) -> DataFrameInformation: return DataFrameInformation( name="features.validation", path=tmp_path / "features.validation.parquet", - profile={"default": profile_default}, + column_profile={"default": column_profile_default}, ) @pytest.fixture -def pipeline_information( +def placeholder_information(roles_empty: Roles) -> PlaceholderInformation: + placeholder_right = PlaceholderInformation( + name="placeholder_right", + roles=roles_empty, + joins=[], + parent=None, + ) + join_information = JoinInformation( + right=placeholder_right, + on=[("left_column", "right_column")], + time_stamps="now", + upper_time_stamp="now", + relationship=Relationship.ONE_TO_MANY, + memory=1.0, + horizon=1.0, + lagged_targets=True, + ) + placeholder_parent = PlaceholderInformation( + name="placeholder_parent", + roles=roles_empty, + joins=[], + parent=None, + ) + return PlaceholderInformation( + name="placeholder_name", + roles=roles_empty, + joins=[join_information], + parent=placeholder_parent.name, + ) + + +@pytest.fixture +def data_model_information( + placeholder_information: PlaceholderInformation, +) -> DataModelInformation: + return DataModelInformation( + population=placeholder_information, + peripheral={placeholder_information.name: [placeholder_information]}, + ) + + +@pytest.fixture +def fast_prop() -> FastProp: + return FastProp( + aggregation=set(), + delta_t=0.0, + loss_function=CROSSENTROPYLOSS, + max_lag=0, + min_df=0, + n_most_frequent=0, + num_features=0, + num_threads=0, + sampling_factor=0.0, + silent=True, + vocab_size=0, + ) + + +@pytest.fixture +def linear_regression() -> LinearRegression: + return LinearRegression( + learning_rate=0.0, + reg_lambda=0.0, + ) + + +@pytest.fixture +def category_trimmer() -> CategoryTrimmer: + return CategoryTrimmer( + max_num_categories=0, + min_freq=0, + ) + + +@pytest.fixture +def pipeline_information( # noqa: PLR0913 tmp_path: Path, table_information_test: TableInformation, table_information_validation: TableInformation, dataframe_information_features_test: DataFrameInformation, dataframe_information_features_validation: DataFrameInformation, + data_model_information: DataModelInformation, + fast_prop: FastProp, + linear_regression: LinearRegression, + category_trimmer: CategoryTrimmer, ) -> PipelineInformation: return PipelineInformation( id="pipeline_id", - predictions=PredictionResults( - { - "test": dataclasses.replace( - table_information_test, - path=tmp_path - / "pipeline" - / "predictions" - / table_information_test.path.name, - ), - "validation": dataclasses.replace( - table_information_validation, - path=tmp_path - / "pipeline" - / "predictions" - / table_information_validation.path.name, - ), - }, - ), - feature_sets=FeatureSets( - { - "test": dataclasses.replace( - dataframe_information_features_test, - path=tmp_path - / "pipeline" - / "feature_sets" - / dataframe_information_features_test.path.name, - ), - "validation": dataclasses.replace( - dataframe_information_features_validation, - path=tmp_path - / "pipeline" - / "feature_sets" - / dataframe_information_features_validation.path.name, - ), - }, - ), + predictions={ + "test": dataclasses.replace( + table_information_test, + path=tmp_path + / "pipeline" + / "predictions" + / table_information_test.path.name, + ), + "validation": dataclasses.replace( + table_information_validation, + path=tmp_path + / "pipeline" + / "predictions" + / table_information_validation.path.name, + ), + }, + feature_sets={ + "test": dataclasses.replace( + dataframe_information_features_test, + path=tmp_path + / "pipeline" + / "feature_sets" + / dataframe_information_features_test.path.name, + ), + "validation": dataclasses.replace( + dataframe_information_features_validation, + path=tmp_path + / "pipeline" + / "feature_sets" + / dataframe_information_features_validation.path.name, + ), + }, path=tmp_path, + feature_learners=[fast_prop], + feature_selectors=[linear_regression], + include_categorical=False, + is_classification=False, + is_regression=True, + loss_function=LossFunction.CROSS_ENTROPY_LOSS, + peripheral=[], + predictors=[linear_regression], + preprocessors=[category_trimmer], + share_selected_features=0.0, + tags=["test_tag"], + targets=["test_target"], + data_model=data_model_information, ) @@ -495,7 +667,7 @@ def generate_raw_summary_statistics_pd(dataframe: DataFrame | View) -> pd.DataFr class MockDuckDBExecuteFactory(Protocol): - def __call__(self, dataframes_by_path: dict[Path, DataFrame | View]) -> None: ... + def __call__(self, dataframes_by_path: Mapping[Path, DataFrame | View]) -> None: ... REGEX_READ_PARQUET = re.compile(r"read_parquet\(\"(?P[^\"]+)\"\)") @@ -506,7 +678,9 @@ def mock_duckdb_execute_factory( mocker: pytest_mock.MockerFixture, tmp_path: Path, ) -> MockDuckDBExecuteFactory: - def mock_duckdb_execute(dataframes_by_path: dict[Path, DataFrame | View]) -> None: + def mock_duckdb_execute( + dataframes_by_path: Mapping[Path, DataFrame | View], + ) -> None: connection_context_manager = mocker.MagicMock(DuckDBPyConnection) connection = mocker.MagicMock(DuckDBPyConnection) connection_context_manager.__enter__.return_value = connection @@ -515,7 +689,10 @@ def mock_duckdb_execute(dataframes_by_path: dict[Path, DataFrame | View]) -> Non return_value=connection_context_manager, ) - def mocked_execute(_query: str, parameters: list[str]) -> DuckDBPyConnection: + def mocked_execute( + _query: str, + parameters: Sequence[str], + ) -> DuckDBPyConnection: assert len(parameters) == 1 current_parquet_path = Path(parameters[0]) mock_execution = mocker.MagicMock(DuckDBPyConnection) diff --git a/tests/unit/metadata/test_container_information.py b/tests/unit/metadata/test_container_information.py index a4d78ad..8a601b4 100644 --- a/tests/unit/metadata/test_container_information.py +++ b/tests/unit/metadata/test_container_information.py @@ -3,8 +3,9 @@ import pytest from pydantic import TypeAdapter +from getml_io.getml.roles import Role from getml_io.metadata.container_information import ContainerInformation -from getml_io.metadata.dataframe_information import Role +from tests.unit.types import ColumnProfileType, ContainerInformationType @pytest.mark.unit @@ -16,7 +17,14 @@ def test_serialize_model_without_dataframe_information( container_information_empty, ) # Then - assert serialized_model == { + expected_serialized_container_information = ( + _get_expected_empty_container_information() + ) + assert serialized_model == expected_serialized_container_information + + +def _get_expected_empty_container_information() -> ContainerInformationType: + return { "id": "container_empty_id", "population": None, "peripheral": {}, @@ -35,7 +43,14 @@ def test_serialize_model( ) # Then - expected_profile = { + expected_serialized_container_information = ( + _get_expected_serialized_container_information() + ) + assert serialized_model == expected_serialized_container_information + + +def _get_expected_serialized_container_information() -> ContainerInformationType: + expected_column_profile: ColumnProfileType = { "default": { "name": "default", "role": Role.NUMERICAL, @@ -55,35 +70,35 @@ def test_serialize_model( }, }, } - assert serialized_model == { + return { "id": "container_id", "population": { "name": "population", "path": Path("population.parquet"), - "profile": expected_profile, + "column_profile": expected_column_profile, }, "peripheral": { "peripheral": { "name": "peripheral", "path": Path("peripheral.parquet"), - "profile": expected_profile, + "column_profile": expected_column_profile, }, }, "subsets": { "train": { "name": "dataframe_train", "path": Path("dataframe_train.parquet"), - "profile": expected_profile, + "column_profile": expected_column_profile, }, "test": { "name": "dataframe_test", "path": Path("dataframe_test.parquet"), - "profile": expected_profile, + "column_profile": expected_column_profile, }, "validation": { "name": "dataframe_validation", "path": Path("dataframe_validation.parquet"), - "profile": expected_profile, + "column_profile": expected_column_profile, }, }, "deep_copy": True, diff --git a/tests/unit/metadata/test_pipeline_information.py b/tests/unit/metadata/test_pipeline_information.py index 15a1f84..fd69582 100644 --- a/tests/unit/metadata/test_pipeline_information.py +++ b/tests/unit/metadata/test_pipeline_information.py @@ -3,8 +3,10 @@ import pytest from pydantic import TypeAdapter -from getml_io.metadata.dataframe_information import Role -from getml_io.metadata.pipeline_information import PipelineInformation +from getml_io.getml.relationships import Relationship +from getml_io.getml.roles import Role +from getml_io.metadata.pipeline_information import LossFunction, PipelineInformation +from tests.unit.types import ColumnProfileType, PipelineInformationType @pytest.mark.unit @@ -17,10 +19,47 @@ def test_serialize_model_without_transforms( ) # Then - assert serialized_model == { + expected_serialized_pipeline_information = ( + _get_expected_serialized_empty_pipeline_information() + ) + assert serialized_model == expected_serialized_pipeline_information + + +def _get_expected_serialized_empty_pipeline_information() -> PipelineInformationType: + return { "id": "pipeline_empty_id", "predictions": {}, "feature_sets": {}, + "data_model": { + "peripheral": {}, + "population": { + "joins": [], + "name": "placeholder_empty", + "parent": None, + "roles": { + "categorical": [], + "join_key": [], + "numerical": [], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [], + "unused_string": [], + }, + }, + }, + "feature_learners": [], + "feature_selectors": [], + "include_categorical": False, + "is_classification": False, + "is_regression": True, + "loss_function": LossFunction.CROSS_ENTROPY_LOSS, + "peripheral": [], + "predictors": [], + "preprocessors": [], + "share_selected_features": 0.0, + "tags": [], + "targets": [], } @@ -34,7 +73,17 @@ def test_serialize_model( ) # Then - expected_profile = { + serialized_model["feature_learners"][0]["aggregation"] = list( + serialized_model["feature_learners"][0]["aggregation"], + ) + expected_serialized_pipeline_information = ( + _get_expected_serialized_pipeline_information() + ) + assert serialized_model == expected_serialized_pipeline_information + + +def _get_expected_serialized_pipeline_information() -> PipelineInformationType: + expected_column_profile: ColumnProfileType = { "default": { "name": "default", "role": Role.NUMERICAL, @@ -54,7 +103,7 @@ def test_serialize_model( }, }, } - assert serialized_model == { + return { "id": "pipeline_id", "predictions": { "test": { @@ -70,12 +119,157 @@ def test_serialize_model( "test": { "name": "features.test", "path": Path("pipeline/feature_sets/features.test.parquet"), - "profile": expected_profile, + "column_profile": expected_column_profile, }, "validation": { "name": "features.validation", "path": Path("pipeline/feature_sets/features.validation.parquet"), - "profile": expected_profile, + "column_profile": expected_column_profile, + }, + }, + "data_model": { + "peripheral": { + "placeholder_name": [ + { + "joins": [ + { + "horizon": 1.0, + "lagged_targets": True, + "memory": 1.0, + "on": [ + ( + "left_column", + "right_column", + ), + ], + "relationship": Relationship.ONE_TO_MANY, + "right": { + "joins": [], + "name": "placeholder_right", + "parent": None, + "roles": { + "categorical": [], + "join_key": [], + "numerical": [], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [], + "unused_string": [], + }, + }, + "time_stamps": "now", + "upper_time_stamp": "now", + }, + ], + "name": "placeholder_name", + "parent": "placeholder_parent", + "roles": { + "categorical": [], + "join_key": [], + "numerical": [], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [], + "unused_string": [], + }, + }, + ], + }, + "population": { + "joins": [ + { + "horizon": 1.0, + "lagged_targets": True, + "memory": 1.0, + "on": [ + ( + "left_column", + "right_column", + ), + ], + "relationship": Relationship.ONE_TO_MANY, + "right": { + "joins": [], + "name": "placeholder_right", + "parent": None, + "roles": { + "categorical": [], + "join_key": [], + "numerical": [], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [], + "unused_string": [], + }, + }, + "time_stamps": "now", + "upper_time_stamp": "now", + }, + ], + "name": "placeholder_name", + "parent": "placeholder_parent", + "roles": { + "categorical": [], + "join_key": [], + "numerical": [], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [], + "unused_string": [], + }, }, }, + "feature_learners": [ + { + "aggregation": [], + "delta_t": 0.0, + "loss_function": "CrossEntropyLoss", + "max_lag": 0, + "min_df": 0, + "n_most_frequent": 0, + "num_features": 0, + "num_threads": 0, + "sampling_factor": 0.0, + "silent": True, + "type": "fast_prop", + "vocab_size": 0, + }, + ], + "feature_selectors": [ + { + "learning_rate": 0.0, + "reg_lambda": 0.0, + "type": "linear_regression", + }, + ], + "include_categorical": False, + "is_classification": False, + "is_regression": True, + "loss_function": LossFunction.CROSS_ENTROPY_LOSS, + "peripheral": [], + "predictors": [ + { + "learning_rate": 0.0, + "reg_lambda": 0.0, + "type": "linear_regression", + }, + ], + "preprocessors": [ + { + "max_num_categories": 0, + "min_freq": 0, + "type": "category_trimmer", + }, + ], + "share_selected_features": 0.0, + "tags": [ + "test_tag", + ], + "targets": [ + "test_target", + ], } diff --git a/tests/unit/metadata/test_utils.py b/tests/unit/metadata/test_utils.py index 5d5054a..90dcbb5 100644 --- a/tests/unit/metadata/test_utils.py +++ b/tests/unit/metadata/test_utils.py @@ -79,3 +79,24 @@ def test_derive_instance_with_relative_path_not_relative( non_relative_path, error_factory, ) + + +@pytest.mark.unit +def test_derive_instance_with_relative_path_not_dataclass( + tmp_path: Path, +) -> None: + # Given + class NotADataclass: + name: str = "not_a_dataclass_instance" + path: Path = Path("not_a_dataclass_instance.parquet") + + # When / Then + with pytest.raises( + TypeError, + match=r"Instance must be a dataclass:", + ): + _ = derive_instance_with_relative_path( + NotADataclass(), + tmp_path, + DataFrameInformationPathNotRelativeError, + ) diff --git a/tests/unit/serialize/test_container.py b/tests/unit/serialize/test_container.py index 6685170..24adb4c 100644 --- a/tests/unit/serialize/test_container.py +++ b/tests/unit/serialize/test_container.py @@ -10,6 +10,7 @@ assume_is_optional_dataframe_or_view, ) from tests.unit.conftest import MockDuckDBExecuteFactory +from tests.unit.types import ColumnProfileType, ContainerInformationType, StatisticsType @pytest.mark.unit @@ -95,6 +96,12 @@ def test_serialize_container( assert not container_information.deep_copy + expected_container_json_content = _get_expected_container_information() + container_json = expected_container_information_json_path.read_text() + assert json.loads(container_json) == expected_container_json_content + + +def _get_expected_container_information() -> ContainerInformationType: expected_statistics_categorical = { "approx_unique": 0, "column_type": "VARCHAR", @@ -104,7 +111,7 @@ def test_serialize_container( "null_percentage": None, "type": "categorical", } - expected_statistics_numerical = { + expected_statistics_numerical: StatisticsType = { "approx_unique": 0, "avg": None, "column_type": "DOUBLE", @@ -118,7 +125,7 @@ def test_serialize_container( "std": None, "type": "numerical", } - expected_profile = { + expected_column_profile: ColumnProfileType = { "Categorical0": { "name": "Categorical0", "role": "categorical", @@ -160,44 +167,41 @@ def test_serialize_container( "statistics": expected_statistics_numerical, }, } - - expected_container_json_content = { + return { "id": "mock_container_id", "population": { "name": "mock_population_name", "path": "container/population/mock_population_name.parquet", - "profile": expected_profile, + "column_profile": expected_column_profile, }, "peripheral": { "mock_peripheral_name": { "name": "mock_peripheral_name", "path": ("container/peripheral/mock_peripheral_name.parquet"), - "profile": expected_profile, + "column_profile": expected_column_profile, }, }, "subsets": { "test": { "name": "mock_dataframe_test", "path": "container/subsets/test.mock_dataframe_test.parquet", - "profile": expected_profile, + "column_profile": expected_column_profile, }, "train": { "name": "mock_dataframe_train", "path": "container/subsets/train.mock_dataframe_train.parquet", - "profile": expected_profile, + "column_profile": expected_column_profile, }, "validation": { "name": "mock_dataframe_validation", "path": ( "container/subsets/validation.mock_dataframe_validation.parquet" ), - "profile": expected_profile, + "column_profile": expected_column_profile, }, }, "deep_copy": False, } - container_json = expected_container_information_json_path.read_text() - assert json.loads(container_json) == expected_container_json_content @pytest.mark.unit @@ -225,15 +229,16 @@ def test_serialize_container_empty( assert len(container_information.subsets) == 0 assert container_information.deep_copy - expected_container_json_content: dict[ - str, - str | bool | dict[str, dict[str, str]] | None, - ] = { + expected_container_json_content = _get_expected_empty_container_information() + container_json = expected_container_information_json_path.read_text() + assert json.loads(container_json) == expected_container_json_content + + +def _get_expected_empty_container_information() -> ContainerInformationType: + return { "id": "mock_container_empty_id", "population": None, "peripheral": {}, "subsets": {}, "deep_copy": True, } - container_json = expected_container_information_json_path.read_text() - assert json.loads(container_json) == expected_container_json_content diff --git a/tests/unit/serialize/test_dataframe_or_view.py b/tests/unit/serialize/test_dataframe_or_view.py index 86ba23d..01e5f03 100644 --- a/tests/unit/serialize/test_dataframe_or_view.py +++ b/tests/unit/serialize/test_dataframe_or_view.py @@ -6,11 +6,11 @@ from getml.data import DataFrame from pydantic import TypeAdapter +from getml_io.getml.roles import Role from getml_io.metadata.dataframe_information import ( ColumnProfile, ColumnStatisticsCategorical, ColumnStatisticsNumerical, - Role, ) from getml_io.serialize.dataframe_or_view import serialize_dataframe_or_view from getml_io.serialize.exception import ( @@ -52,6 +52,13 @@ def test_serialize_dataframe_or_view( assert serialized_info.path == expected_parquet_path mock_dataframe.to_parquet.assert_called_once_with(str(expected_parquet_path)) + expected_column_profile = _get_expected_column_profile() + assert column_profiles_to_json( + serialized_info.column_profile, + ) == column_profiles_to_json(expected_column_profile) + + +def _get_expected_column_profile() -> dict[str, ColumnProfile]: nan = float("nan") expected_statistics_categorical = ColumnStatisticsCategorical( count=0, @@ -76,7 +83,7 @@ def test_serialize_dataframe_or_view( column_type="DOUBLE", type="numerical", ) - expected_profile = { + return { "Categorical0": ColumnProfile( name="Categorical0", role=Role.CATEGORICAL, @@ -118,15 +125,14 @@ def test_serialize_dataframe_or_view( statistics=expected_statistics_numerical, ), } - assert profile_to_json(serialized_info.profile) == profile_to_json(expected_profile) -def profile_to_json( - profile: Mapping[str, ColumnProfile], +def column_profiles_to_json( + column_profiles_by_name: Mapping[str, ColumnProfile], ) -> dict[str, bytes]: return { column_name: ColumnProfileAdapter.dump_json(column_profile) - for column_name, column_profile in profile.items() + for column_name, column_profile in column_profiles_by_name.items() } diff --git a/tests/unit/serialize/test_pipeline.py b/tests/unit/serialize/test_pipeline.py index 87cc522..5639d6e 100644 --- a/tests/unit/serialize/test_pipeline.py +++ b/tests/unit/serialize/test_pipeline.py @@ -1,15 +1,48 @@ from pathlib import Path import pytest +from getml import feature_learning as getml_feature_learner +from getml import predictors as getml_predictor +from getml import preprocessors as getml_preprocessor from getml.data import Container, DataFrame from getml.pipeline import Pipeline +from getml_io.getml.feature_learning import ( + Fastboost, + FastProp, + FeatureLearner, + Multirel, + Relboost, + RelMT, +) +from getml_io.getml.predictors import ( + LinearRegression, + LogisticRegression, + Predictor, + ScaleGBMClassifier, + ScaleGBMRegressor, + XGBoostClassifier, + XGBoostRegressor, +) +from getml_io.getml.preprocessors import ( + CategoryTrimmer, + EmailDomain, + Imputation, + Mapping, + Preprocessor, + Seasonal, + Substring, + TextFieldSplitter, +) from getml_io.metadata.feature_sets import FeatureSets from getml_io.metadata.prediction_results import PredictionResults from getml_io.serialize.pipeline import ( + serialize_feature_learner, serialize_feature_sets, serialize_pipeline, serialize_predictions, + serialize_predictor, + serialize_preprocessor, ) from tests.unit.conftest import MockDuckDBExecuteFactory @@ -176,3 +209,87 @@ def _assert_features_valid( expected_path = path / f"features.{subset_name}.parquet" assert features[subset_name].path == expected_path + + +@pytest.mark.unit +@pytest.mark.parametrize( + ("feature_learner", "expected_result_type"), + [ + (getml_feature_learner.Fastboost(), Fastboost), + (getml_feature_learner.FastProp(), FastProp), + (getml_feature_learner.Multirel(), Multirel), + (getml_feature_learner.Relboost(), Relboost), + (getml_feature_learner.RelMT(), RelMT), + ], +) +def test_serialize_feature_learner( + feature_learner: getml_feature_learner.Fastboost + | getml_feature_learner.FastProp + | getml_feature_learner.Multirel + | getml_feature_learner.Relboost + | getml_feature_learner.RelMT, + expected_result_type: type[FeatureLearner], +) -> None: + # When + serialized_feature_learner = serialize_feature_learner(feature_learner) + + # Then + assert isinstance(serialized_feature_learner, expected_result_type) + + +@pytest.mark.unit +@pytest.mark.parametrize( + ("predictor", "expected_result_type"), + [ + (getml_predictor.LinearRegression(), LinearRegression), + (getml_predictor.LogisticRegression(), LogisticRegression), + (getml_predictor.ScaleGBMClassifier(), ScaleGBMClassifier), + (getml_predictor.ScaleGBMRegressor(), ScaleGBMRegressor), + (getml_predictor.XGBoostClassifier(), XGBoostClassifier), + (getml_predictor.XGBoostRegressor(), XGBoostRegressor), + ], +) +def test_serialize_predictor( + predictor: getml_predictor.LinearRegression + | getml_predictor.LogisticRegression + | getml_predictor.ScaleGBMClassifier + | getml_predictor.ScaleGBMRegressor + | getml_predictor.XGBoostClassifier + | getml_predictor.XGBoostRegressor, + expected_result_type: type[Predictor], +) -> None: + # When + serialized_predictor = serialize_predictor(predictor) + + # Then + assert isinstance(serialized_predictor, expected_result_type) + + +@pytest.mark.unit +@pytest.mark.parametrize( + ("preprocessor", "expected_result_type"), + [ + (getml_preprocessor.CategoryTrimmer(), CategoryTrimmer), + (getml_preprocessor.EmailDomain(), EmailDomain), + (getml_preprocessor.Imputation(), Imputation), + (getml_preprocessor.Mapping(), Mapping), + (getml_preprocessor.Seasonal(), Seasonal), + (getml_preprocessor.Substring(0, 1), Substring), + (getml_preprocessor.TextFieldSplitter(), TextFieldSplitter), + ], +) +def test_serialize_preprocessor( + preprocessor: getml_preprocessor.CategoryTrimmer + | getml_preprocessor.EmailDomain + | getml_preprocessor.Imputation + | getml_preprocessor.Mapping + | getml_preprocessor.Seasonal + | getml_preprocessor.Substring + | getml_preprocessor.TextFieldSplitter, + expected_result_type: type[Preprocessor], +) -> None: + # When + serialized_preprocessor = serialize_preprocessor(preprocessor) + + # Then + assert isinstance(serialized_preprocessor, expected_result_type) diff --git a/tests/unit/serialize/test_pipeline_information.py b/tests/unit/serialize/test_pipeline_information.py index acf0008..ca9d8ed 100644 --- a/tests/unit/serialize/test_pipeline_information.py +++ b/tests/unit/serialize/test_pipeline_information.py @@ -10,6 +10,7 @@ PipelineInformationStorageError, ) from getml_io.serialize.pipeline_information import serialize_pipeline_information +from tests.unit.types import PipelineInformationType @pytest.mark.unit @@ -32,17 +33,51 @@ def test_serialize_pipeline_information( assert pipeline_information_json_path.is_file() pipeline_information_json = expected_pipeline_information_json_path.read_text() - expected_pipeline_information_json_content = { - "id": "pipeline_empty_id", - "predictions": {}, - "feature_sets": {}, - } + expected_pipeline_information_json_content = _get_expected_pipeline_information() assert ( json.loads(pipeline_information_json) == expected_pipeline_information_json_content ) +def _get_expected_pipeline_information() -> PipelineInformationType: + return { + "id": "pipeline_empty_id", + "predictions": {}, + "feature_sets": {}, + "data_model": { + "peripheral": {}, + "population": { + "joins": [], + "name": "placeholder_empty", + "parent": None, + "roles": { + "categorical": [], + "join_key": [], + "numerical": [], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [], + "unused_string": [], + }, + }, + }, + "feature_learners": [], + "feature_selectors": [], + "include_categorical": False, + "is_classification": False, + "is_regression": True, + "loss_function": "CrossEntropyLoss", + "peripheral": [], + "predictors": [], + "preprocessors": [], + "share_selected_features": 0.0, + "tags": [], + "targets": [], + } + + @pytest.mark.unit def test_serialize_pipeline_information_serialization_error( mocker: pytest_mock.MockerFixture, diff --git a/tests/unit/types.py b/tests/unit/types.py new file mode 100644 index 0000000..1bb5cc0 --- /dev/null +++ b/tests/unit/types.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from pathlib import Path + +from getml_io.getml.relationships import Relationship +from getml_io.metadata.pipeline_information import LossFunction + +StatisticsType = Mapping[str, str | float | int | None] +ColumnProfileType = Mapping[str, Mapping[str, str | StatisticsType]] +DataFrameInformationType = Mapping[str, str | Path | ColumnProfileType] +JoinInformationType = Mapping[ + str, + "float | bool | Sequence[tuple[str, str]] | Relationship | PlaceholderInformationType | str", # noqa: E501 +] +RolesType = Mapping[str, Sequence[str]] +PlaceholderInformationType = Mapping[ + str, + Sequence["JoinInformationType"] | str | RolesType | None, +] +DataModelInformationType = Mapping[ + str, + Mapping[ + str, + Sequence[PlaceholderInformationType], + ] + | PlaceholderInformationType, +] +ContainerInformationType = Mapping[ + str, + str + | DataFrameInformationType + | Mapping[str, DataFrameInformationType] + | bool + | None, +] + +PredictionType = Mapping[str, Mapping[str, str | Path]] +FeatureSetType = Mapping[str, Mapping[str, str | Path | ColumnProfileType]] +FeatureLearnerType = Mapping[str, float | str | int | bool | Sequence[str]] +PredictorType = Mapping[str, float | str] +PipelineInformationType = Mapping[ + str, + str + | PredictionType + | FeatureSetType + | DataModelInformationType + | Sequence[FeatureLearnerType] + | Sequence[PredictorType] + | bool + | LossFunction + | float + | Sequence[str], +]