Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "uv_build"

[project]
name = "getml-io"
version = "0.1.0"
version = "0.2.0"
description = "Library for serializing data and information from getML projects, pipelines, containers, dataframes, data-models and related components"
readme = "README.md"
authors = [
Expand Down
16 changes: 8 additions & 8 deletions src/getml_io/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import typer
from typer import Option

from getml_io.getml.project_information import ProjectInformation
from getml_io.getml.project_identification import ProjectIdentification
from getml_io.serialize.project import serialize_project
from getml_io.utils.storage import get_default_root_storage_directory

Expand Down Expand Up @@ -82,7 +82,7 @@ def serialize(
root_storage_directory: Annotated[
Path,
Option(
help="Path to the directory where the serialized project will be saved",
help="Path to the directory where the serialized project will be saved.",
prompt=False,
show_default=True,
),
Expand All @@ -97,14 +97,14 @@ def serialize(
] = False,
) -> None:
"""Serialize a getML project."""
project_information: ProjectInformation = ProjectInformation(
project_identification = ProjectIdentification(
project_name=project,
pipeline_id=pipeline,
container_id=container,
)
serialize_project(
project_information,
root_storage_directory,
_ = serialize_project(
project_identification=project_identification,
root_storage_directory=root_storage_directory,
clear_storage_directory=clear_storage_directory,
)

Expand Down Expand Up @@ -148,12 +148,12 @@ def deserialize(
] = DEFAULT_ROOT_STORAGE_DIRECTORY,
) -> None:
"""Deserialize a getML project."""
project_information: ProjectInformation = ProjectInformation(
project_identification = ProjectIdentification(
project_name=project,
pipeline_id=pipeline,
container_id=container,
)
message = f"Deserializing {project_information!r} from {root_storage_directory}"
message = f"Deserializing {project_identification!r} from {root_storage_directory}"
# TODO @urfoex: #20: Implement deserialization logic
raise NotImplementedError(message)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pydantic import BaseModel


class ProjectInformation(BaseModel, frozen=True):
class ProjectIdentification(BaseModel, frozen=True):
    """Frozen pydantic model holding the three identifiers of a serialization target.

    Groups a project name, a pipeline id and a container id into one
    immutable value.
    """

    project_name: str
    pipeline_id: str
    container_id: str
110 changes: 110 additions & 0 deletions src/getml_io/metadata/column_statistics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from __future__ import annotations

from datetime import datetime
from enum import Enum
from typing import Annotated, Literal

from pydantic import BaseModel, Field

from getml_io.getml.roles import Role


class ColumnStatisticsBase(BaseModel, frozen=True):
    """Fields shared by every column-statistics model in this module.

    All concrete statistics classes below derive (directly or indirectly)
    from this frozen pydantic model.
    """

    # NOTE(review): field names presumably mirror the columns of DuckDB's
    # SUMMARIZE output (see the ColumnType docstring) -- confirm against the
    # code that builds these models.
    count: int
    approx_unique: int
    # May be None; the model does not constrain the value range.
    null_percentage: float | None


class ColumnStatisticsDouble(ColumnStatisticsBase, frozen=True):
    """Statistics for columns whose ``column_type`` is ``"DOUBLE"``.

    Base class for the float-valued role-specific models (numerical, target,
    time stamp stored as float, unused float). Every statistic is optional.
    """

    avg: float | None
    min: float | None
    max: float | None
    # q25/q50/q75: presumably the 25th/50th/75th percentiles -- TODO confirm.
    q25: float | None
    q50: float | None
    q75: float | None
    std: float | None
    # Required, no default: callers must pass the literal "DOUBLE".
    column_type: Literal["DOUBLE"]


class ColumnStatisticsVarchar(ColumnStatisticsBase, frozen=True):
    """Statistics for columns whose ``column_type`` is ``"VARCHAR"``.

    Base class for the string-valued role-specific models (categorical,
    join key, text, unused string). Only ``min`` and ``max`` are recorded in
    addition to the shared base fields.
    """

    min: str | None
    max: str | None
    # Required, no default: callers must pass the literal "VARCHAR".
    column_type: Literal["VARCHAR"]


class ColumnStatisticsNumerical(ColumnStatisticsDouble, frozen=True):
    """DOUBLE statistics tagged with ``type == "numerical"``."""

    # Discriminator value used by the ColumnStatistics union.
    type: Literal["numerical"] = "numerical"


class ColumnStatisticsTarget(ColumnStatisticsDouble, frozen=True):
    """DOUBLE statistics tagged with ``type == "target"``."""

    # Discriminator value used by the ColumnStatistics union.
    type: Literal["target"] = "target"


class ColumnStatisticsTimeStamp(ColumnStatisticsBase, frozen=True):
    """Statistics for ``"TIMESTAMP_NS"`` columns, expressed as ``datetime``.

    Unlike the DOUBLE-based models this class derives directly from
    ``ColumnStatisticsBase`` and declares no ``std`` field.
    """

    avg: datetime | None
    min: datetime | None
    max: datetime | None
    # q25/q50/q75: presumably the 25th/50th/75th percentiles -- TODO confirm.
    q25: datetime | None
    q50: datetime | None
    q75: datetime | None
    # Required, no default: callers must pass the literal "TIMESTAMP_NS".
    column_type: Literal["TIMESTAMP_NS"]
    # Discriminator value used by the ColumnStatistics union.
    type: Literal["time_stamp"] = "time_stamp"


class ColumnStatisticsTimeStampAsFloat(ColumnStatisticsDouble, frozen=True):
    """DOUBLE statistics tagged with ``type == "time_stamp_float"``.

    Counterpart of ``ColumnStatisticsTimeStamp`` for columns whose
    ``column_type`` is ``"DOUBLE"`` rather than ``"TIMESTAMP_NS"``.
    """

    # Discriminator value used by the ColumnStatistics union.
    type: Literal["time_stamp_float"] = "time_stamp_float"


class ColumnStatisticsCategorical(ColumnStatisticsVarchar, frozen=True):
    """VARCHAR statistics tagged with ``type == "categorical"``."""

    # Discriminator value used by the ColumnStatistics union.
    type: Literal["categorical"] = "categorical"


class ColumnStatisticsJoinKey(ColumnStatisticsVarchar, frozen=True):
    """VARCHAR statistics tagged with ``type == "join_key"``."""

    # Discriminator value used by the ColumnStatistics union.
    type: Literal["join_key"] = "join_key"


class ColumnStatisticsText(ColumnStatisticsVarchar, frozen=True):
    """VARCHAR statistics tagged with ``type == "text"``."""

    # Discriminator value used by the ColumnStatistics union.
    type: Literal["text"] = "text"


class ColumnStatisticsUnusedFloat(ColumnStatisticsDouble, frozen=True):
    """DOUBLE statistics tagged with ``type == "unused_float"``."""

    # Discriminator value used by the ColumnStatistics union.
    type: Literal["unused_float"] = "unused_float"


class ColumnStatisticsUnusedString(ColumnStatisticsVarchar, frozen=True):
    """VARCHAR statistics tagged with ``type == "unused_string"``."""

    # Discriminator value used by the ColumnStatistics union.
    type: Literal["unused_string"] = "unused_string"


# Discriminated union over all concrete statistics models. Pydantic uses the
# ``type`` field (a distinct Literal on every member) to pick which model to
# validate against, so each member's ``type`` literal must stay unique.
ColumnStatistics = Annotated[
    ColumnStatisticsNumerical
    | ColumnStatisticsTarget
    | ColumnStatisticsCategorical
    | ColumnStatisticsJoinKey
    | ColumnStatisticsTimeStamp
    | ColumnStatisticsTimeStampAsFloat
    | ColumnStatisticsText
    | ColumnStatisticsUnusedFloat
    | ColumnStatisticsUnusedString,
    Field(discriminator="type"),
]


class ColumnType(str, Enum):
    """Column types supported by GetML-IO based on DuckDB's SUMMARIZE statistics.

    Members are also ``str`` instances, and their values equal the
    ``column_type`` literals declared on the statistics models.
    """

    DOUBLE = "DOUBLE"
    TIMESTAMP_NS = "TIMESTAMP_NS"
    VARCHAR = "VARCHAR"


# Lookup table from a (role, column type) pair to the concrete statistics
# model class for that combination. Role.TIME_STAMP is the only role listed
# with two column types (TIMESTAMP_NS and DOUBLE); pairs not present here have
# no associated statistics class.
ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING = {
    (Role.CATEGORICAL, ColumnType.VARCHAR): ColumnStatisticsCategorical,
    (Role.JOIN_KEY, ColumnType.VARCHAR): ColumnStatisticsJoinKey,
    (Role.NUMERICAL, ColumnType.DOUBLE): ColumnStatisticsNumerical,
    (Role.TARGET, ColumnType.DOUBLE): ColumnStatisticsTarget,
    (Role.TIME_STAMP, ColumnType.TIMESTAMP_NS): ColumnStatisticsTimeStamp,
    (Role.TIME_STAMP, ColumnType.DOUBLE): ColumnStatisticsTimeStampAsFloat,
    (Role.TEXT, ColumnType.VARCHAR): ColumnStatisticsText,
    (Role.UNUSED_FLOAT, ColumnType.DOUBLE): ColumnStatisticsUnusedFloat,
    (Role.UNUSED_STRING, ColumnType.VARCHAR): ColumnStatisticsUnusedString,
}
109 changes: 2 additions & 107 deletions src/getml_io/metadata/dataframe_information.py
Original file line number Diff line number Diff line change
@@ -1,94 +1,11 @@
from __future__ import annotations

from collections.abc import Mapping
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Annotated, Literal

from pydantic import BaseModel, Field
from pydantic import BaseModel

from getml_io.getml.roles import Role


class ColumnStatisticsBase(BaseModel, frozen=True):
count: int
approx_unique: int
null_percentage: float | None


class ColumnStatisticsDouble(ColumnStatisticsBase, frozen=True):
avg: float | None
min: float | None
max: float | None
q25: float | None
q50: float | None
q75: float | None
std: float | None
column_type: Literal["DOUBLE"]


class ColumnStatisticsVarchar(ColumnStatisticsBase, frozen=True):
min: str | None
max: str | None
column_type: Literal["VARCHAR"]


class ColumnStatisticsNumerical(ColumnStatisticsDouble, frozen=True):
type: Literal["numerical"] = "numerical"


class ColumnStatisticsTarget(ColumnStatisticsDouble, frozen=True):
type: Literal["target"] = "target"


class ColumnStatisticsTimeStamp(ColumnStatisticsBase, frozen=True):
avg: datetime | None
min: datetime | None
max: datetime | None
q25: datetime | None
q50: datetime | None
q75: datetime | None
column_type: Literal["TIMESTAMP_NS"]
type: Literal["time_stamp"] = "time_stamp"


class ColumnStatisticsTimeStampAsFloat(ColumnStatisticsDouble, frozen=True):
type: Literal["time_stamp_float"] = "time_stamp_float"


class ColumnStatisticsCategorical(ColumnStatisticsVarchar, frozen=True):
type: Literal["categorical"] = "categorical"


class ColumnStatisticsJoinKey(ColumnStatisticsVarchar, frozen=True):
type: Literal["join_key"] = "join_key"


class ColumnStatisticsText(ColumnStatisticsVarchar, frozen=True):
type: Literal["text"] = "text"


class ColumnStatisticsUnusedFloat(ColumnStatisticsDouble, frozen=True):
type: Literal["unused_float"] = "unused_float"


class ColumnStatisticsUnusedString(ColumnStatisticsVarchar, frozen=True):
type: Literal["unused_string"] = "unused_string"


ColumnStatistics = Annotated[
ColumnStatisticsNumerical
| ColumnStatisticsTarget
| ColumnStatisticsCategorical
| ColumnStatisticsJoinKey
| ColumnStatisticsTimeStamp
| ColumnStatisticsTimeStampAsFloat
| ColumnStatisticsText
| ColumnStatisticsUnusedFloat
| ColumnStatisticsUnusedString,
Field(discriminator="type"),
]
from getml_io.metadata.column_statistics import ColumnStatistics


class ColumnInformation(BaseModel, frozen=True):
Expand All @@ -99,29 +16,7 @@ class ColumnInformation(BaseModel, frozen=True):

class DataFrameInformation(BaseModel, frozen=True):
name: str
path: Path
columns: Mapping[str, ColumnInformation]


DataFrameInformationByName = Mapping[str, DataFrameInformation]


class ColumnType(str, Enum):
"""Column types supported by GetML-IO based on DuckDBs SUMMARIZE statistics."""

DOUBLE = "DOUBLE"
TIMESTAMP_NS = "TIMESTAMP_NS"
VARCHAR = "VARCHAR"


ROLE_TO_COLUMN_STATISTICS_TYPE_MAPPING = {
(Role.CATEGORICAL, ColumnType.VARCHAR): ColumnStatisticsCategorical,
(Role.JOIN_KEY, ColumnType.VARCHAR): ColumnStatisticsJoinKey,
(Role.NUMERICAL, ColumnType.DOUBLE): ColumnStatisticsNumerical,
(Role.TARGET, ColumnType.DOUBLE): ColumnStatisticsTarget,
(Role.TIME_STAMP, ColumnType.TIMESTAMP_NS): ColumnStatisticsTimeStamp,
(Role.TIME_STAMP, ColumnType.DOUBLE): ColumnStatisticsTimeStampAsFloat,
(Role.TEXT, ColumnType.VARCHAR): ColumnStatisticsText,
(Role.UNUSED_FLOAT, ColumnType.DOUBLE): ColumnStatisticsUnusedFloat,
(Role.UNUSED_STRING, ColumnType.VARCHAR): ColumnStatisticsUnusedString,
}
44 changes: 0 additions & 44 deletions src/getml_io/metadata/exception.py

This file was deleted.

9 changes: 9 additions & 0 deletions src/getml_io/metadata/project_information.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from pydantic import BaseModel

from getml_io.metadata.container_information import ContainerInformation
from getml_io.metadata.pipeline_information import PipelineInformation


class ProjectInformation(BaseModel, frozen=True):
    """Frozen pydantic model pairing container and pipeline information."""

    container_information: ContainerInformation
    pipeline_information: PipelineInformation
Loading