From 2be5f11043f5f3a7ef14c504eff41453e06539cb Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 02:18:03 +0000 Subject: [PATCH 01/42] Introduce AttributeType system to replace AttributeAdapter This commit introduces a modern, extensible custom type system for DataJoint: **New Features:** - AttributeType base class with encode()/decode() methods - Global type registry with @register_type decorator - Entry point discovery for third-party type packages (datajoint.types) - Type chaining: dtype can reference another custom type - Automatic validation via validate() method before encoding - resolve_dtype() for resolving chained types **API Changes:** - New: dj.AttributeType, dj.register_type, dj.list_types - AttributeAdapter is now deprecated (backward-compatible wrapper) - Feature flag DJ_SUPPORT_ADAPTED_TYPES is no longer required **Entry Point Specification:** Third-party packages can declare types in pyproject.toml: [project.entry-points."datajoint.types"] zarr_array = "dj_zarr:ZarrArrayType" **Migration Path:** Old AttributeAdapter subclasses continue to work but emit DeprecationWarning. Migrate to AttributeType with encode/decode. 
--- src/datajoint/__init__.py | 6 +- src/datajoint/attribute_adapter.py | 188 +++++++++++-- src/datajoint/attribute_type.py | 413 +++++++++++++++++++++++++++++ src/datajoint/declare.py | 4 +- src/datajoint/fetch.py | 5 +- src/datajoint/heading.py | 43 ++- src/datajoint/table.py | 4 +- tests/conftest.py | 11 +- tests/test_adapted_attributes.py | 22 +- tests/test_attribute_type.py | 347 ++++++++++++++++++++++++ 10 files changed, 993 insertions(+), 50 deletions(-) create mode 100644 src/datajoint/attribute_type.py create mode 100644 tests/test_attribute_type.py diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index 0f8123c66..feff400bf 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -45,7 +45,10 @@ "kill", "MatCell", "MatStruct", - "AttributeAdapter", + "AttributeType", + "register_type", + "list_types", + "AttributeAdapter", # Deprecated, use AttributeType "errors", "DataJointError", "key", @@ -57,6 +60,7 @@ from . import errors from .admin import kill from .attribute_adapter import AttributeAdapter +from .attribute_type import AttributeType, list_types, register_type from .blob import MatCell, MatStruct from .cli import cli from .connection import Connection, conn diff --git a/src/datajoint/attribute_adapter.py b/src/datajoint/attribute_adapter.py index 12a34f27e..5c687bff6 100644 --- a/src/datajoint/attribute_adapter.py +++ b/src/datajoint/attribute_adapter.py @@ -1,61 +1,191 @@ +""" +Legacy attribute adapter module. + +This module provides backward compatibility for the deprecated AttributeAdapter class. +New code should use :class:`datajoint.AttributeType` instead. + +.. deprecated:: 0.15 + Use :class:`datajoint.AttributeType` with ``encode``/``decode`` methods. 
+""" + import re +import warnings +from typing import Any -from .errors import DataJointError, _support_adapted_types +from .attribute_type import AttributeType, get_type, is_type_registered +from .errors import DataJointError -class AttributeAdapter: +class AttributeAdapter(AttributeType): """ - Base class for adapter objects for user-defined attribute types. + Legacy base class for attribute adapters. + + .. deprecated:: 0.15 + Use :class:`datajoint.AttributeType` with ``encode``/``decode`` methods instead. + + This class provides backward compatibility for existing adapters that use + the ``attribute_type``, ``put()``, and ``get()`` API. + + Migration guide:: + + # Old style (deprecated): + class GraphAdapter(dj.AttributeAdapter): + attribute_type = "longblob" + + def put(self, graph): + return list(graph.edges) + + def get(self, edges): + return nx.Graph(edges) + + # New style (recommended): + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph, *, key=None): + return list(graph.edges) + + def decode(self, edges, *, key=None): + return nx.Graph(edges) """ + # Subclasses can set this as a class attribute instead of property + attribute_type: str = None # type: ignore + + def __init__(self): + # Emit deprecation warning on instantiation + warnings.warn( + f"{self.__class__.__name__} uses the deprecated AttributeAdapter API. " + "Migrate to AttributeType with encode/decode methods.", + DeprecationWarning, + stacklevel=2, + ) + @property - def attribute_type(self): + def type_name(self) -> str: """ - :return: a supported DataJoint attribute type to use; e.g. "longblob", "blob@store" + Infer type name from class name for legacy adapters. + + Legacy adapters were identified by their variable name in the context dict, + not by a property. For backward compatibility, we use the lowercase class name. 
""" - raise NotImplementedError("Undefined attribute adapter") + # Check if a _type_name was explicitly set (for context-based lookup) + if hasattr(self, "_type_name"): + return self._type_name + # Fall back to class name + return self.__class__.__name__.lower() - def get(self, value): + @property + def dtype(self) -> str: + """Map legacy attribute_type to new dtype property.""" + attr_type = self.attribute_type + if attr_type is None: + raise NotImplementedError( + f"{self.__class__.__name__} must define 'attribute_type' " + "(or migrate to AttributeType with 'dtype')" + ) + return attr_type + + def encode(self, value: Any, *, key: dict | None = None) -> Any: + """Delegate to legacy put() method.""" + return self.put(value) + + def decode(self, stored: Any, *, key: dict | None = None) -> Any: + """Delegate to legacy get() method.""" + return self.get(stored) + + def put(self, obj: Any) -> Any: """ - convert value retrieved from the the attribute in a table into the adapted type + Convert an object of the adapted type into a storable value. + + .. deprecated:: 0.15 + Override ``encode()`` instead. - :param value: value from the database + Args: + obj: An object of the adapted type. - :return: object of the adapted type + Returns: + Value to store in the database. """ - raise NotImplementedError("Undefined attribute adapter") + raise NotImplementedError( + f"{self.__class__.__name__} must implement put() or migrate to encode()" + ) - def put(self, obj): + def get(self, value: Any) -> Any: """ - convert an object of the adapted type into a value that DataJoint can store in a table attribute + Convert a value from the database into the adapted type. + + .. deprecated:: 0.15 + Override ``decode()`` instead. + + Args: + value: Value from the database. - :param obj: an object of the adapted type - :return: value to store in the database + Returns: + Object of the adapted type. 
""" - raise NotImplementedError("Undefined attribute adapter") + raise NotImplementedError( + f"{self.__class__.__name__} must implement get() or migrate to decode()" + ) -def get_adapter(context, adapter_name): +def get_adapter(context: dict | None, adapter_name: str) -> AttributeType: """ - Extract the AttributeAdapter object by its name from the context and validate. + Get an attribute type/adapter by name. + + This function provides backward compatibility by checking both: + 1. The global type registry (new system) + 2. The schema context dict (legacy system) + + Args: + context: Schema context dictionary (for legacy adapters). + adapter_name: The adapter/type name, with or without angle brackets. + + Returns: + The AttributeType instance. + + Raises: + DataJointError: If the adapter is not found or invalid. """ - if not _support_adapted_types(): - raise DataJointError("Support for Adapted Attribute types is disabled.") adapter_name = adapter_name.lstrip("<").rstrip(">") + + # First, check the global type registry (new system) + if is_type_registered(adapter_name): + return get_type(adapter_name) + + # Fall back to context-based lookup (legacy system) + if context is None: + raise DataJointError( + f"Attribute type <{adapter_name}> is not registered. " + "Use @dj.register_type to register custom types." + ) + try: adapter = context[adapter_name] except KeyError: - raise DataJointError("Attribute adapter '{adapter_name}' is not defined.".format(adapter_name=adapter_name)) - if not isinstance(adapter, AttributeAdapter): raise DataJointError( - "Attribute adapter '{adapter_name}' must be an instance of datajoint.AttributeAdapter".format( - adapter_name=adapter_name - ) + f"Attribute type <{adapter_name}> is not defined. " + "Register it with @dj.register_type or include it in the schema context." 
) - if not isinstance(adapter.attribute_type, str) or not re.match(r"^\w", adapter.attribute_type): + + # Validate it's an AttributeType (or legacy AttributeAdapter) + if not isinstance(adapter, AttributeType): raise DataJointError( - "Invalid attribute type {type} in attribute adapter '{adapter_name}'".format( - type=adapter.attribute_type, adapter_name=adapter_name - ) + f"Attribute adapter '{adapter_name}' must be an instance of " + "datajoint.AttributeType (or legacy datajoint.AttributeAdapter)" ) + + # For legacy adapters from context, store the name they were looked up by + if isinstance(adapter, AttributeAdapter): + adapter._type_name = adapter_name + + # Validate the dtype/attribute_type + dtype = adapter.dtype + if not isinstance(dtype, str) or not re.match(r"^\w", dtype): + raise DataJointError( + f"Invalid dtype '{dtype}' in attribute type <{adapter_name}>" + ) + return adapter diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py new file mode 100644 index 000000000..ac524d926 --- /dev/null +++ b/src/datajoint/attribute_type.py @@ -0,0 +1,413 @@ +""" +Custom attribute type system for DataJoint. + +This module provides the AttributeType base class and registration mechanism +for creating custom data types that extend DataJoint's native type system. + +Custom types enable seamless integration of complex Python objects (like NumPy arrays, +graphs, or domain-specific structures) with DataJoint's relational storage. 
+ +Example: + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph: nx.Graph) -> list: + return list(graph.edges) + + def decode(self, edges: list) -> nx.Graph: + return nx.Graph(edges) + + # Then use in table definitions: + class MyTable(dj.Manual): + definition = ''' + id : int + --- + data : + ''' +""" + +from __future__ import annotations + +import logging +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +from .errors import DataJointError + +if TYPE_CHECKING: + pass + +logger = logging.getLogger(__name__.split(".")[0]) + +# Global type registry - maps type_name to AttributeType instance +_type_registry: dict[str, AttributeType] = {} +_entry_points_loaded: bool = False + + +class AttributeType(ABC): + """ + Base class for custom DataJoint attribute types. + + Subclass this to create custom types that can be used in table definitions + with the ```` syntax. Custom types define bidirectional conversion + between Python objects and DataJoint's storage format. + + Attributes: + type_name: Unique identifier used in ```` syntax + dtype: Underlying DataJoint storage type + + Example: + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph): + return list(graph.edges) + + def decode(self, edges): + import networkx as nx + return nx.Graph(edges) + + The type can then be used in table definitions:: + + class Connectivity(dj.Manual): + definition = ''' + id : int + --- + graph_data : + ''' + """ + + @property + @abstractmethod + def type_name(self) -> str: + """ + Unique identifier for this type, used in table definitions as ````. + + This name must be unique across all registered types. It should be lowercase + with underscores (e.g., "graph", "zarr_array", "compressed_image"). + + Returns: + The type name string without angle brackets. + """ + ... 
+ + @property + @abstractmethod + def dtype(self) -> str: + """ + The underlying DataJoint type used for storage. + + Can be: + - A native type: ``"longblob"``, ``"blob"``, ``"varchar(255)"``, ``"int"``, ``"json"`` + - An external type: ``"blob@store"``, ``"attach@store"`` + - The object type: ``"object"`` + - Another custom type: ``""`` (enables type chaining) + + Returns: + The storage type specification string. + """ + ... + + @abstractmethod + def encode(self, value: Any, *, key: dict | None = None) -> Any: + """ + Convert a Python object to the storable format. + + Called during INSERT operations to transform user-provided objects + into a format suitable for storage in the underlying ``dtype``. + + Args: + value: The Python object to store. + key: Primary key values as a dict. Available when the dtype uses + object storage and may be needed for path construction. + + Returns: + Value in the format expected by ``dtype``. For example: + - For ``dtype="longblob"``: any picklable Python object + - For ``dtype="object"``: path string or file-like object + - For ``dtype="varchar(N)"``: string + """ + ... + + @abstractmethod + def decode(self, stored: Any, *, key: dict | None = None) -> Any: + """ + Convert stored data back to a Python object. + + Called during FETCH operations to reconstruct the original Python + object from the stored format. + + Args: + stored: Data retrieved from storage. Type depends on ``dtype``: + - For ``"object"``: an ``ObjectRef`` handle + - For blob types: the unpacked Python object + - For native types: the native Python value (str, int, etc.) + key: Primary key values as a dict. + + Returns: + The reconstructed Python object. + """ + ... + + def validate(self, value: Any) -> None: + """ + Validate a value before encoding. + + Override this method to add type checking or domain constraints. + Called automatically before ``encode()`` during INSERT operations. + The default implementation accepts any value. 
+ + Args: + value: The value to validate. + + Raises: + TypeError: If the value has an incompatible type. + ValueError: If the value fails domain validation. + """ + pass + + def default(self) -> Any: + """ + Return a default value for this type. + + Override if the type has a sensible default value. The default + implementation raises NotImplementedError, indicating no default exists. + + Returns: + The default value for this type. + + Raises: + NotImplementedError: If no default exists (the default behavior). + """ + raise NotImplementedError(f"No default value for type <{self.type_name}>") + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}(type_name={self.type_name!r}, dtype={self.dtype!r})>" + + +def register_type(cls: type[AttributeType]) -> type[AttributeType]: + """ + Register a custom attribute type with DataJoint. + + Can be used as a decorator or called directly. The type becomes available + for use in table definitions with the ```` syntax. + + Args: + cls: An AttributeType subclass to register. + + Returns: + The same class, unmodified (allows use as decorator). + + Raises: + DataJointError: If a type with the same name is already registered + by a different class. + TypeError: If cls is not an AttributeType subclass. + + Example: + As a decorator:: + + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + ... 
+ + Or called directly:: + + dj.register_type(GraphType) + """ + if not isinstance(cls, type) or not issubclass(cls, AttributeType): + raise TypeError(f"register_type requires an AttributeType subclass, got {cls!r}") + + instance = cls() + name = instance.type_name + + if not isinstance(name, str) or not name: + raise DataJointError(f"type_name must be a non-empty string, got {name!r}") + + if name in _type_registry: + existing = _type_registry[name] + if type(existing) is not cls: + raise DataJointError( + f"Type <{name}> is already registered by " + f"{type(existing).__module__}.{type(existing).__name__}" + ) + # Same class registered twice - idempotent, no error + return cls + + _type_registry[name] = instance + logger.debug(f"Registered attribute type <{name}> from {cls.__module__}.{cls.__name__}") + return cls + + +def unregister_type(name: str) -> None: + """ + Remove a type from the registry. + + Primarily useful for testing. Use with caution in production code. + + Args: + name: The type_name to unregister. + + Raises: + DataJointError: If the type is not registered. + """ + name = name.strip("<>") + if name not in _type_registry: + raise DataJointError(f"Type <{name}> is not registered") + del _type_registry[name] + + +def get_type(name: str) -> AttributeType: + """ + Retrieve a registered attribute type by name. + + Looks up the type in the explicit registry first, then attempts + to load from installed packages via entry points. + + Args: + name: The type name, with or without angle brackets. + + Returns: + The registered AttributeType instance. + + Raises: + DataJointError: If the type is not found. + """ + name = name.strip("<>") + + # Check explicit registry first + if name in _type_registry: + return _type_registry[name] + + # Lazy-load entry points + _load_entry_points() + + if name in _type_registry: + return _type_registry[name] + + raise DataJointError( + f"Unknown attribute type: <{name}>. 
" + f"Ensure the type is registered via @dj.register_type or installed as a package." + ) + + +def list_types() -> list[str]: + """ + List all registered type names. + + Returns: + Sorted list of registered type names. + """ + _load_entry_points() + return sorted(_type_registry.keys()) + + +def is_type_registered(name: str) -> bool: + """ + Check if a type name is registered. + + Args: + name: The type name to check. + + Returns: + True if the type is registered. + """ + name = name.strip("<>") + if name in _type_registry: + return True + _load_entry_points() + return name in _type_registry + + +def _load_entry_points() -> None: + """ + Load attribute types from installed packages via entry points. + + Types are discovered from the ``datajoint.types`` entry point group. + Packages declare types in pyproject.toml:: + + [project.entry-points."datajoint.types"] + zarr_array = "dj_zarr:ZarrArrayType" + + This function is idempotent - entry points are only loaded once. + """ + global _entry_points_loaded + if _entry_points_loaded: + return + + _entry_points_loaded = True + + try: + from importlib.metadata import entry_points + except ImportError: + # Python < 3.10 fallback + try: + from importlib_metadata import entry_points + except ImportError: + logger.debug("importlib.metadata not available, skipping entry point discovery") + return + + try: + # Python 3.10+ / importlib_metadata 3.6+ + eps = entry_points(group="datajoint.types") + except TypeError: + # Older API + eps = entry_points().get("datajoint.types", []) + + for ep in eps: + if ep.name in _type_registry: + # Already registered explicitly, skip entry point + continue + try: + type_class = ep.load() + register_type(type_class) + logger.debug(f"Loaded attribute type <{ep.name}> from entry point {ep.value}") + except Exception as e: + logger.warning(f"Failed to load attribute type '{ep.name}' from {ep.value}: {e}") + + +def resolve_dtype(dtype: str, seen: set[str] | None = None) -> tuple[str, 
list[AttributeType]]: + """ + Resolve a dtype string, following type chains. + + If dtype references another custom type (e.g., ""), recursively + resolves to find the ultimate storage type. + + Args: + dtype: The dtype string to resolve. + seen: Set of already-seen type names (for cycle detection). + + Returns: + Tuple of (final_storage_type, list_of_types_in_chain). + The chain is ordered from outermost to innermost type. + + Raises: + DataJointError: If a circular type reference is detected. + """ + if seen is None: + seen = set() + + chain: list[AttributeType] = [] + + # Check if dtype is a custom type reference + if dtype.startswith("<") and dtype.endswith(">"): + type_name = dtype[1:-1] + + if type_name in seen: + raise DataJointError(f"Circular type reference detected: <{type_name}>") + + seen.add(type_name) + attr_type = get_type(type_name) + chain.append(attr_type) + + # Recursively resolve the inner dtype + inner_dtype, inner_chain = resolve_dtype(attr_type.dtype, seen) + chain.extend(inner_chain) + return inner_dtype, chain + + # Not a custom type - return as-is + return dtype, chain diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index c1a22f0ca..995984389 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -480,8 +480,8 @@ def substitute_special_type(match, category, foreign_key_sql, context): "ON UPDATE RESTRICT ON DELETE RESTRICT".format(external_table_root=EXTERNAL_TABLE_ROOT, **match) ) elif category == "ADAPTED": - adapter = get_adapter(context, match["type"]) - match["type"] = adapter.attribute_type + attr_type = get_adapter(context, match["type"]) + match["type"] = attr_type.dtype category = match_type(match["type"]) if category in SPECIAL_TYPES: # recursive redefinition from user-defined datatypes. 
diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index 5d02b52b0..0cac13632 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -53,8 +53,9 @@ def _get(connection, attr, data, squeeze, download_path): extern = connection.schemas[attr.database].external[attr.store] if attr.is_external else None - # apply attribute adapter if present - adapt = attr.adapter.get if attr.adapter else lambda x: x + # apply custom attribute type decoder if present + def adapt(x): + return attr.adapter.decode(x, key=None) if attr.adapter else x if attr.is_filepath: return adapt(extern.download_filepath(uuid.UUID(bytes=data))[0]) diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 45e35998c..1e40451ee 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -5,7 +5,8 @@ import numpy as np -from .attribute_adapter import AttributeAdapter, get_adapter +from .attribute_adapter import get_adapter +from .attribute_type import AttributeType from .declare import ( EXTERNAL_TYPES, NATIVE_TYPES, @@ -15,6 +16,36 @@ ) from .errors import FILEPATH_FEATURE_SWITCH, DataJointError, _support_filepath_types + +class _MissingType(AttributeType): + """Placeholder for missing/unregistered attribute types. Raises error on use.""" + + def __init__(self, name: str): + self._name = name + + @property + def type_name(self) -> str: + return self._name + + @property + def dtype(self) -> str: + raise DataJointError( + f"Attribute type <{self._name}> is not registered. " + "Register it with @dj.register_type or include it in the schema context." + ) + + def encode(self, value, *, key=None): + raise DataJointError( + f"Attribute type <{self._name}> is not registered. " + "Register it with @dj.register_type or include it in the schema context." + ) + + def decode(self, stored, *, key=None): + raise DataJointError( + f"Attribute type <{self._name}> is not registered. " + "Register it with @dj.register_type or include it in the schema context." 
+ ) + logger = logging.getLogger(__name__.split(".")[0]) default_attribute_properties = dict( # these default values are set in computed attributes @@ -279,7 +310,7 @@ def _init_from_database(self): if special: special = special.groupdict() attr.update(special) - # process adapted attribute types + # process custom attribute types (adapted types) if special and TYPE_PATTERN["ADAPTED"].match(attr["type"]): assert context is not None, "Declaration context is not set" adapter_name = special["type"] @@ -287,14 +318,12 @@ def _init_from_database(self): attr.update(adapter=get_adapter(context, adapter_name)) except DataJointError: # if no adapter, then delay the error until the first invocation - attr.update(adapter=AttributeAdapter()) + attr.update(adapter=_MissingType(adapter_name)) else: - attr.update(type=attr["adapter"].attribute_type) + attr.update(type=attr["adapter"].dtype) if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()): raise DataJointError( - "Invalid attribute type '{type}' in adapter object <{adapter_name}>.".format( - adapter_name=adapter_name, **attr - ) + f"Invalid dtype '{attr['type']}' in attribute type <{adapter_name}>." 
) special = not any(TYPE_PATTERN[c].match(attr["type"]) for c in NATIVE_TYPES) diff --git a/src/datajoint/table.py b/src/datajoint/table.py index a8a52c3e0..20f579225 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -726,7 +726,9 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False): return None attr = self.heading[name] if attr.adapter: - value = attr.adapter.put(value) + # Custom attribute type: validate and encode + attr.adapter.validate(value) + value = attr.adapter.encode(value, key=None) if value is None or (attr.numeric and (value == "" or np.isnan(float(value)))): # set default value placeholder, value = "DEFAULT", None diff --git a/tests/conftest.py b/tests/conftest.py index 8a6ba4057..37241de86 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,7 +16,6 @@ import datajoint as dj from datajoint.errors import ( - ADAPTED_TYPE_SWITCH, FILEPATH_FEATURE_SWITCH, DataJointError, ) @@ -334,10 +333,14 @@ def monkeymodule(): @pytest.fixture -def enable_adapted_types(monkeypatch): - monkeypatch.setenv(ADAPTED_TYPE_SWITCH, "TRUE") +def enable_adapted_types(): + """ + Deprecated fixture - custom attribute types no longer require a feature flag. + + This fixture is kept for backward compatibility but does nothing. + Custom types are now enabled by default via the AttributeType system. + """ yield - monkeypatch.delenv(ADAPTED_TYPE_SWITCH, raising=True) @pytest.fixture diff --git a/tests/test_adapted_attributes.py b/tests/test_adapted_attributes.py index 1060a50ed..0b4285ffb 100644 --- a/tests/test_adapted_attributes.py +++ b/tests/test_adapted_attributes.py @@ -1,3 +1,10 @@ +""" +Tests for adapted/custom attribute types. + +These tests use the legacy AttributeAdapter API for backward compatibility testing. +""" + +import warnings from itertools import zip_longest import networkx as nx @@ -8,6 +15,9 @@ from . 
import schema_adapted from .schema_adapted import Connectivity, Layout +# Filter deprecation warnings from legacy AttributeAdapter usage in these tests +pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") + @pytest.fixture def schema_name(prefix): @@ -16,24 +26,28 @@ def schema_name(prefix): @pytest.fixture def adapted_graph_instance(): - yield schema_adapted.GraphAdapter() + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + yield schema_adapted.GraphAdapter() @pytest.fixture def schema_ad( connection_test, adapted_graph_instance, - enable_adapted_types, enable_filepath_feature, s3_creds, tmpdir, schema_name, ): dj.config["stores"] = {"repo-s3": dict(s3_creds, protocol="s3", location="adapted/repo", stage=str(tmpdir))} + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + layout_adapter = schema_adapted.LayoutToFilepath() context = { **schema_adapted.LOCALS_ADAPTED, "graph": adapted_graph_instance, - "layout_to_filepath": schema_adapted.LayoutToFilepath(), + "layout_to_filepath": layout_adapter, } schema = dj.schema(schema_name, context=context, connection=connection_test) schema(schema_adapted.Connectivity) @@ -92,7 +106,7 @@ def test_adapted_filepath_type(schema_ad, minio_client): c.delete() -def test_adapted_spawned(local_schema, enable_adapted_types): +def test_adapted_spawned(local_schema): c = Connectivity() # a spawned class graphs = [ nx.lollipop_graph(4, 2), diff --git a/tests/test_attribute_type.py b/tests/test_attribute_type.py new file mode 100644 index 000000000..294b7eee8 --- /dev/null +++ b/tests/test_attribute_type.py @@ -0,0 +1,347 @@ +""" +Tests for the new AttributeType system. 
+""" + +import pytest + +import datajoint as dj +from datajoint.attribute_type import ( + AttributeType, + _type_registry, + get_type, + is_type_registered, + list_types, + register_type, + resolve_dtype, + unregister_type, +) +from datajoint.errors import DataJointError + + +class TestAttributeTypeRegistry: + """Tests for the type registry functionality.""" + + def setup_method(self): + """Clear any test types from registry before each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + """Clean up test types after each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_register_type_decorator(self): + """Test registering a type using the decorator.""" + + @register_type + class TestType(AttributeType): + type_name = "test_decorator" + dtype = "longblob" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + assert is_type_registered("test_decorator") + assert get_type("test_decorator").type_name == "test_decorator" + + def test_register_type_direct(self): + """Test registering a type by calling register_type directly.""" + + class TestType(AttributeType): + type_name = "test_direct" + dtype = "varchar(255)" + + def encode(self, value, *, key=None): + return str(value) + + def decode(self, stored, *, key=None): + return stored + + register_type(TestType) + assert is_type_registered("test_direct") + + def test_register_type_idempotent(self): + """Test that registering the same type twice is idempotent.""" + + @register_type + class TestType(AttributeType): + type_name = "test_idempotent" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + # Second registration should not raise + register_type(TestType) + assert is_type_registered("test_idempotent") + + def 
test_register_duplicate_name_different_class(self): + """Test that registering different classes with same name raises error.""" + + @register_type + class TestType1(AttributeType): + type_name = "test_duplicate" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + class TestType2(AttributeType): + type_name = "test_duplicate" + dtype = "varchar(100)" + + def encode(self, value, *, key=None): + return str(value) + + def decode(self, stored, *, key=None): + return stored + + with pytest.raises(DataJointError, match="already registered"): + register_type(TestType2) + + def test_unregister_type(self): + """Test unregistering a type.""" + + @register_type + class TestType(AttributeType): + type_name = "test_unregister" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + assert is_type_registered("test_unregister") + unregister_type("test_unregister") + assert not is_type_registered("test_unregister") + + def test_get_type_not_found(self): + """Test that getting an unregistered type raises error.""" + with pytest.raises(DataJointError, match="Unknown attribute type"): + get_type("nonexistent_type") + + def test_list_types(self): + """Test listing registered types.""" + + @register_type + class TestType(AttributeType): + type_name = "test_list" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + types = list_types() + assert "test_list" in types + assert types == sorted(types) # Should be sorted + + def test_get_type_strips_brackets(self): + """Test that get_type accepts names with or without angle brackets.""" + + @register_type + class TestType(AttributeType): + type_name = "test_brackets" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + 
assert get_type("test_brackets") is get_type("") + + +class TestAttributeTypeValidation: + """Tests for the validate method.""" + + def setup_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_validate_called_default(self): + """Test that default validate accepts any value.""" + + @register_type + class TestType(AttributeType): + type_name = "test_validate_default" + dtype = "longblob" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + t = get_type("test_validate_default") + # Default validate should not raise for any value + t.validate(None) + t.validate(42) + t.validate("string") + t.validate([1, 2, 3]) + + def test_validate_custom(self): + """Test custom validation logic.""" + + @register_type + class PositiveIntType(AttributeType): + type_name = "test_positive_int" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + def validate(self, value): + if not isinstance(value, int): + raise TypeError(f"Expected int, got {type(value).__name__}") + if value < 0: + raise ValueError("Value must be positive") + + t = get_type("test_positive_int") + t.validate(42) # Should pass + + with pytest.raises(TypeError): + t.validate("not an int") + + with pytest.raises(ValueError): + t.validate(-1) + + +class TestTypeChaining: + """Tests for type chaining (dtype referencing another custom type).""" + + def setup_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_resolve_native_dtype(self): + """Test resolving a native 
dtype.""" + final_dtype, chain = resolve_dtype("longblob") + assert final_dtype == "longblob" + assert chain == [] + + def test_resolve_custom_dtype(self): + """Test resolving a custom dtype.""" + + @register_type + class TestType(AttributeType): + type_name = "test_resolve" + dtype = "varchar(100)" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain = resolve_dtype("<test_resolve>") + assert final_dtype == "varchar(100)" + assert len(chain) == 1 + assert chain[0].type_name == "test_resolve" + + def test_resolve_chained_dtype(self): + """Test resolving a chained dtype.""" + + @register_type + class InnerType(AttributeType): + type_name = "test_inner" + dtype = "longblob" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class OuterType(AttributeType): + type_name = "test_outer" + dtype = "<test_inner>" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain = resolve_dtype("<test_outer>") + assert final_dtype == "longblob" + assert len(chain) == 2 + assert chain[0].type_name == "test_outer" + assert chain[1].type_name == "test_inner" + + def test_circular_reference_detection(self): + """Test that circular type references are detected.""" + + @register_type + class TypeA(AttributeType): + type_name = "test_circular_a" + dtype = "<test_circular_b>" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TypeB(AttributeType): + type_name = "test_circular_b" + dtype = "<test_circular_a>" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + with pytest.raises(DataJointError, match="Circular type reference"): + resolve_dtype("<test_circular_a>") + + +class TestExportsAndAPI: + """Test that the public API is properly exported.""" + + def
test_exports_from_datajoint(self): + """Test that AttributeType and helpers are exported from datajoint.""" + assert hasattr(dj, "AttributeType") + assert hasattr(dj, "register_type") + assert hasattr(dj, "list_types") + + def test_attribute_adapter_deprecated(self): + """Test that AttributeAdapter is still available but deprecated.""" + assert hasattr(dj, "AttributeAdapter") + # AttributeAdapter should be a subclass of AttributeType + assert issubclass(dj.AttributeAdapter, dj.AttributeType) From 055c9c6d4fa7ad7a75a576bff85211e8f27a62cd Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 02:21:30 +0000 Subject: [PATCH 02/42] Update documentation for new AttributeType system - Rewrite customtype.md with comprehensive documentation: - Overview of encode/decode pattern - Required components (type_name, dtype, encode, decode) - Type registration with @dj.register_type decorator - Validation with validate() method - Storage types (dtype options) - Type chaining for composable types - Key parameter for context-aware encoding - Entry point packages for distribution - Complete neuroscience example - Migration guide from AttributeAdapter - Best practices - Update attributes.md to reference custom types --- docs/src/design/tables/attributes.md | 4 + docs/src/design/tables/customtype.md | 474 ++++++++++++++++++++++++--- 2 files changed, 440 insertions(+), 38 deletions(-) diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index 9363e527f..4f8a0644e 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -77,6 +77,10 @@ sending/receiving an opaque data file to/from a DataJoint pipeline. - `filepath@store`: a [filepath](filepath.md) used to link non-DataJoint managed files into a DataJoint pipeline. +- `<custom_type>`: a [custom attribute type](customtype.md) that defines bidirectional +conversion between Python objects and database storage formats.
Use this to store +complex data types like graphs, domain-specific objects, or custom data structures. + ## Numeric type aliases DataJoint provides convenient type aliases that map to standard MySQL numeric types. diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md index aad194ff5..43a168358 100644 --- a/docs/src/design/tables/customtype.md +++ b/docs/src/design/tables/customtype.md @@ -1,4 +1,4 @@ -# Custom Types +# Custom Attribute Types In modern scientific research, data pipelines often involve complex workflows that generate diverse data types. From high-dimensional imaging data to machine learning @@ -12,69 +12,467 @@ traditional relational databases. For example: + Computational biologists might store fitted machine learning models or parameter objects for downstream predictions. -To handle these diverse needs, DataJoint provides the `dj.AttributeAdapter` method. It +To handle these diverse needs, DataJoint provides the **AttributeType** system. It enables researchers to store and retrieve complex, non-standard data types—like Python objects or data structures—in a relational database while maintaining the reproducibility, modularity, and query capabilities required for scientific workflows. -## Uses in Scientific Research +## Overview -Imagine a neuroscience lab studying neural connectivity. Researchers might generate -graphs (e.g., networkx.Graph) to represent connections between brain regions, where: +Custom attribute types define bidirectional conversion between: -+ Nodes are brain regions. -+ Edges represent connections weighted by signal strength or another metric. +- **Python objects** (what your code works with) +- **Storage format** (what gets stored in the database) -Storing these graph objects in a database alongside other experimental data (e.g., -subject metadata, imaging parameters) ensures: - -1. 
Centralized Data Management: All experimental data and analysis results are stored - together for easy access and querying. -2. Reproducibility: The exact graph objects used in analysis can be retrieved later for - validation or further exploration. -3. Scalability: Graph data can be integrated into workflows for larger datasets or - across experiments. - -However, since graphs are not natively supported by relational databases, here’s where -`dj.AttributeAdapter` becomes essential. It allows researchers to define custom logic for -serializing graphs (e.g., as edge lists) and deserializing them back into Python -objects, bridging the gap between advanced data types and the database. +``` +┌─────────────────┐ encode() ┌─────────────────┐ +│ Python Object │ ───────────────► │ Storage Type │ +│ (e.g. Graph) │ │ (e.g. blob) │ +└─────────────────┘ decode() └─────────────────┘ + ◄─────────────── +``` -### Example: Storing Graphs in DataJoint +## Defining Custom Types -To store a networkx.Graph object in a DataJoint table, researchers can define a custom -attribute type in a datajoint table class: +Create a custom type by subclassing `dj.AttributeType` and implementing the required +methods: ```python import datajoint as dj +import networkx as nx -class GraphAdapter(dj.AttributeAdapter): +@dj.register_type +class GraphType(dj.AttributeType): + """Custom type for storing networkx graphs.""" - attribute_type = 'longblob' # this is how the attribute will be declared + # Required: unique identifier used in table definitions + type_name = "graph" - def put(self, obj): - # convert the nx.Graph object into an edge list - assert isinstance(obj, nx.Graph) - return list(obj.edges) + # Required: underlying DataJoint storage type + dtype = "longblob" - def get(self, value): - # convert edge list back into an nx.Graph - return nx.Graph(value) + def encode(self, graph, *, key=None): + """Convert graph to storable format (called on INSERT).""" + return list(graph.edges) + def 
decode(self, edges, *, key=None): + """Convert stored data back to graph (called on FETCH).""" + return nx.Graph(edges) +``` -# instantiate for use as a datajoint type -graph = GraphAdapter() +### Required Components +| Component | Description | +|-----------|-------------| +| `type_name` | Unique identifier used in table definitions with `<type_name>` syntax | +| `dtype` | Underlying DataJoint type for storage (e.g., `"longblob"`, `"varchar(255)"`, `"json"`) | +| `encode(value, *, key=None)` | Converts Python object to storable format | +| `decode(stored, *, key=None)` | Converts stored data back to Python object | -# define a table with a graph attribute -schema = dj.schema('test_graphs') +### Using Custom Types in Tables +Once registered, use the type in table definitions with angle brackets: +```python @schema class Connectivity(dj.Manual): definition = """ conn_id : int --- - conn_graph = null : <graph> # a networkx.Graph object + conn_graph = null : <graph> # Uses the GraphType we defined """ ``` + +Insert and fetch work seamlessly: + +```python +import networkx as nx + +# Insert - encode() is called automatically +g = nx.lollipop_graph(4, 2) +Connectivity.insert1({"conn_id": 1, "conn_graph": g}) + +# Fetch - decode() is called automatically +result = (Connectivity & "conn_id = 1").fetch1("conn_graph") +assert isinstance(result, nx.Graph) +``` + +## Type Registration + +### Decorator Registration + +The simplest way to register a type is with the `@dj.register_type` decorator: + +```python +@dj.register_type +class MyType(dj.AttributeType): + type_name = "my_type" + ... +``` + +### Direct Registration + +You can also register types explicitly: + +```python +class MyType(dj.AttributeType): + type_name = "my_type" + ... + +dj.register_type(MyType) +``` + +### Listing Registered Types + +```python +# List all registered type names +print(dj.list_types()) +``` + +## Validation + +Add data validation by overriding the `validate()` method.
It's called automatically +before `encode()` during INSERT operations: + +```python +@dj.register_type +class PositiveArrayType(dj.AttributeType): + type_name = "positive_array" + dtype = "longblob" + + def validate(self, value): + """Ensure all values are positive.""" + import numpy as np + if not isinstance(value, np.ndarray): + raise TypeError(f"Expected numpy array, got {type(value).__name__}") + if np.any(value < 0): + raise ValueError("Array must contain only positive values") + + def encode(self, array, *, key=None): + return array + + def decode(self, stored, *, key=None): + return stored +``` + +## Storage Types (dtype) + +The `dtype` property specifies how data is stored in the database: + +| dtype | Use Case | Stored Format | +|-------|----------|---------------| +| `"longblob"` | Complex Python objects, arrays | Serialized binary | +| `"blob"` | Smaller objects | Serialized binary | +| `"json"` | JSON-serializable data | JSON string | +| `"varchar(N)"` | String representations | Text | +| `"int"` | Integer identifiers | Integer | +| `"blob@store"` | Large objects in external storage | UUID reference | +| `"object"` | Files/folders in object storage | JSON metadata | +| `"<other_type>"` | Chain to another custom type | Varies | + +### External Storage + +For large data, use external blob storage: + +```python +@dj.register_type +class LargeArrayType(dj.AttributeType): + type_name = "large_array" + dtype = "blob@mystore" # Uses external store named "mystore" + + def encode(self, array, *, key=None): + return array + + def decode(self, stored, *, key=None): + return stored +``` + +## Type Chaining + +Custom types can build on other custom types by referencing them in `dtype`: + +```python +@dj.register_type +class CompressedGraphType(dj.AttributeType): + type_name = "compressed_graph" + dtype = "<graph>" # Chain to the GraphType + + def encode(self, graph, *, key=None): + # Compress before passing to GraphType + return self._compress(graph) + + def decode(self, stored, *,
key=None): + # GraphType's decode already ran + return self._decompress(stored) +``` + +DataJoint automatically resolves the chain to find the final storage type. + +## The Key Parameter + +The `key` parameter provides access to primary key values during encode/decode +operations. This is useful when the conversion depends on record context: + +```python +@dj.register_type +class ContextAwareType(dj.AttributeType): + type_name = "context_aware" + dtype = "longblob" + + def encode(self, value, *, key=None): + if key and key.get("version") == 2: + return self._encode_v2(value) + return self._encode_v1(value) + + def decode(self, stored, *, key=None): + if key and key.get("version") == 2: + return self._decode_v2(stored) + return self._decode_v1(stored) +``` + +## Publishing Custom Types as Packages + +Custom types can be distributed as installable packages using Python entry points. +This allows types to be automatically discovered when the package is installed. + +### Package Structure + +``` +dj-graph-types/ +├── pyproject.toml +└── src/ + └── dj_graph_types/ + ├── __init__.py + └── types.py +``` + +### pyproject.toml + +```toml +[project] +name = "dj-graph-types" +version = "1.0.0" + +[project.entry-points."datajoint.types"] +graph = "dj_graph_types.types:GraphType" +weighted_graph = "dj_graph_types.types:WeightedGraphType" +``` + +### Type Implementation + +```python +# src/dj_graph_types/types.py +import datajoint as dj +import networkx as nx + +class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph, *, key=None): + return list(graph.edges) + + def decode(self, edges, *, key=None): + return nx.Graph(edges) + +class WeightedGraphType(dj.AttributeType): + type_name = "weighted_graph" + dtype = "longblob" + + def encode(self, graph, *, key=None): + return [(u, v, d) for u, v, d in graph.edges(data=True)] + + def decode(self, edges, *, key=None): + g = nx.Graph() + g.add_weighted_edges_from(edges) + return g +``` + 
+### Usage After Installation + +```bash +pip install dj-graph-types +``` + +```python +# Types are automatically available after package installation +@schema +class MyTable(dj.Manual): + definition = """ + id : int + --- + network : + weighted_network : + """ +``` + +## Complete Example + +Here's a complete example demonstrating custom types for a neuroscience workflow: + +```python +import datajoint as dj +import numpy as np + +# Configure DataJoint +dj.config["database.host"] = "localhost" +dj.config["database.user"] = "root" +dj.config["database.password"] = "password" + +# Define custom types +@dj.register_type +class SpikeTrainType(dj.AttributeType): + """Efficient storage for sparse spike timing data.""" + type_name = "spike_train" + dtype = "longblob" + + def validate(self, value): + if not isinstance(value, np.ndarray): + raise TypeError("Expected numpy array of spike times") + if value.ndim != 1: + raise ValueError("Spike train must be 1-dimensional") + if not np.all(np.diff(value) >= 0): + raise ValueError("Spike times must be sorted") + + def encode(self, spike_times, *, key=None): + # Store as differences (smaller values, better compression) + return np.diff(spike_times, prepend=0).astype(np.float32) + + def decode(self, stored, *, key=None): + # Reconstruct original spike times + return np.cumsum(stored).astype(np.float64) + + +@dj.register_type +class WaveformType(dj.AttributeType): + """Storage for spike waveform templates with metadata.""" + type_name = "waveform" + dtype = "longblob" + + def encode(self, waveform_dict, *, key=None): + return { + "data": waveform_dict["data"].astype(np.float32), + "sampling_rate": waveform_dict["sampling_rate"], + "channel_ids": list(waveform_dict["channel_ids"]), + } + + def decode(self, stored, *, key=None): + return { + "data": stored["data"].astype(np.float64), + "sampling_rate": stored["sampling_rate"], + "channel_ids": np.array(stored["channel_ids"]), + } + + +# Create schema and tables +schema = 
dj.schema("ephys_analysis") + +@schema +class Unit(dj.Manual): + definition = """ + unit_id : int + --- + spike_times : + waveform : + quality : enum('good', 'mua', 'noise') + """ + + +# Usage +spike_times = np.array([0.1, 0.15, 0.23, 0.45, 0.67, 0.89]) +waveform = { + "data": np.random.randn(82, 4), + "sampling_rate": 30000, + "channel_ids": [10, 11, 12, 13], +} + +Unit.insert1({ + "unit_id": 1, + "spike_times": spike_times, + "waveform": waveform, + "quality": "good", +}) + +# Fetch - automatically decoded +result = (Unit & "unit_id = 1").fetch1() +print(f"Spike times: {result['spike_times']}") +print(f"Waveform shape: {result['waveform']['data'].shape}") +``` + +## Migration from AttributeAdapter + +The `AttributeAdapter` class is deprecated. Migrate to `AttributeType`: + +### Before (deprecated) + +```python +class GraphAdapter(dj.AttributeAdapter): + attribute_type = "longblob" + + def put(self, obj): + return list(obj.edges) + + def get(self, value): + return nx.Graph(value) + +# Required context-based registration +graph = GraphAdapter() +schema = dj.schema("mydb", context={"graph": graph}) +``` + +### After (recommended) + +```python +@dj.register_type +class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, obj, *, key=None): + return list(obj.edges) + + def decode(self, value, *, key=None): + return nx.Graph(value) + +# Global registration - no context needed +schema = dj.schema("mydb") +``` + +### Key Differences + +| Aspect | AttributeAdapter (deprecated) | AttributeType (recommended) | +|--------|-------------------------------|----------------------------| +| Methods | `put()` / `get()` | `encode()` / `decode()` | +| Storage type | `attribute_type` | `dtype` | +| Type name | Variable name in context | `type_name` property | +| Registration | Context dict per schema | Global `@register_type` decorator | +| Validation | Manual | Built-in `validate()` method | +| Distribution | Copy adapter code | Entry point 
packages | +| Key access | Not available | Optional `key` parameter | + +## Best Practices + +1. **Choose descriptive type names**: Use lowercase with underscores (e.g., `spike_train`, `graph_embedding`) + +2. **Select appropriate storage types**: Use `longblob` for complex objects, `json` for simple structures, external storage for large data + +3. **Add validation**: Use `validate()` to catch data errors early + +4. **Document your types**: Include docstrings explaining the expected input/output formats + +5. **Handle None values**: Your encode/decode methods may receive `None` for nullable attributes + +6. **Consider versioning**: If your encoding format might change, include version information + +7. **Test round-trips**: Ensure `decode(encode(x)) == x` for all valid inputs + +```python +def test_graph_type_roundtrip(): + g = nx.lollipop_graph(4, 2) + t = GraphType() + + encoded = t.encode(g) + decoded = t.decode(encoded) + + assert set(g.edges) == set(decoded.edges) +``` From af9bd8dfac0a3e11977ff813bef6865942a6e8ff Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 02:30:59 +0000 Subject: [PATCH 03/42] Apply ruff-format fixes to AttributeType implementation --- src/datajoint/attribute_adapter.py | 18 +++++------------- src/datajoint/attribute_type.py | 6 ++---- src/datajoint/heading.py | 5 ++--- 3 files changed, 9 insertions(+), 20 deletions(-) diff --git a/src/datajoint/attribute_adapter.py b/src/datajoint/attribute_adapter.py index 5c687bff6..7e49abb5c 100644 --- a/src/datajoint/attribute_adapter.py +++ b/src/datajoint/attribute_adapter.py @@ -83,8 +83,7 @@ def dtype(self) -> str: attr_type = self.attribute_type if attr_type is None: raise NotImplementedError( - f"{self.__class__.__name__} must define 'attribute_type' " - "(or migrate to AttributeType with 'dtype')" + f"{self.__class__.__name__} must define 'attribute_type' " "(or migrate to AttributeType with 'dtype')" ) return attr_type @@ -109,9 +108,7 @@ def put(self, obj: Any) -> Any: 
Returns: Value to store in the database. """ - raise NotImplementedError( - f"{self.__class__.__name__} must implement put() or migrate to encode()" - ) + raise NotImplementedError(f"{self.__class__.__name__} must implement put() or migrate to encode()") def get(self, value: Any) -> Any: """ @@ -126,9 +123,7 @@ def get(self, value: Any) -> Any: Returns: Object of the adapted type. """ - raise NotImplementedError( - f"{self.__class__.__name__} must implement get() or migrate to decode()" - ) + raise NotImplementedError(f"{self.__class__.__name__} must implement get() or migrate to decode()") def get_adapter(context: dict | None, adapter_name: str) -> AttributeType: @@ -158,8 +153,7 @@ def get_adapter(context: dict | None, adapter_name: str) -> AttributeType: # Fall back to context-based lookup (legacy system) if context is None: raise DataJointError( - f"Attribute type <{adapter_name}> is not registered. " - "Use @dj.register_type to register custom types." + f"Attribute type <{adapter_name}> is not registered. " "Use @dj.register_type to register custom types." 
) try: @@ -184,8 +178,6 @@ def get_adapter(context: dict | None, adapter_name: str) -> AttributeType: # Validate the dtype/attribute_type dtype = adapter.dtype if not isinstance(dtype, str) or not re.match(r"^\w", dtype): - raise DataJointError( - f"Invalid dtype '{dtype}' in attribute type <{adapter_name}>" - ) + raise DataJointError(f"Invalid dtype '{dtype}' in attribute type <{adapter_name}>") return adapter diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py index ac524d926..31393b2a9 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/attribute_type.py @@ -232,8 +232,7 @@ class GraphType(dj.AttributeType): existing = _type_registry[name] if type(existing) is not cls: raise DataJointError( - f"Type <{name}> is already registered by " - f"{type(existing).__module__}.{type(existing).__name__}" + f"Type <{name}> is already registered by " f"{type(existing).__module__}.{type(existing).__name__}" ) # Same class registered twice - idempotent, no error return cls @@ -290,8 +289,7 @@ def get_type(name: str) -> AttributeType: return _type_registry[name] raise DataJointError( - f"Unknown attribute type: <{name}>. " - f"Ensure the type is registered via @dj.register_type or installed as a package." + f"Unknown attribute type: <{name}>. " f"Ensure the type is registered via @dj.register_type or installed as a package." ) diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 1e40451ee..6b89b9eb1 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -46,6 +46,7 @@ def decode(self, stored, *, key=None): "Register it with @dj.register_type or include it in the schema context." 
) + logger = logging.getLogger(__name__.split(".")[0]) default_attribute_properties = dict( # these default values are set in computed attributes @@ -322,9 +323,7 @@ def _init_from_database(self): else: attr.update(type=attr["adapter"].dtype) if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()): - raise DataJointError( - f"Invalid dtype '{attr['type']}' in attribute type <{adapter_name}>." - ) + raise DataJointError(f"Invalid dtype '{attr['type']}' in attribute type <{adapter_name}>.") special = not any(TYPE_PATTERN[c].match(attr["type"]) for c in NATIVE_TYPES) if special: From 9bd37f6675f5eaed047109a01979edb51e035c3a Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 02:52:28 +0000 Subject: [PATCH 04/42] Add DJBlobType and migration utilities for blob columns Introduces `` as an explicit AttributeType for DataJoint's native blob serialization, allowing users to be explicit about serialization behavior in table definitions. Key changes: - Add DJBlobType class with `serializes=True` flag to indicate it handles its own serialization (avoiding double pack/unpack) - Update table.py and fetch.py to respect the `serializes` flag, skipping blob.pack/unpack when adapter handles serialization - Add `dj.migrate` module with utilities for migrating existing schemas to use explicit `` type declarations - Add tests for DJBlobType functionality - Document `` type and migration procedure The migration is metadata-only - blob data format is unchanged. Existing `longblob` columns continue to work with implicit serialization for backward compatibility. 
--- docs/src/design/tables/customtype.md | 114 ++++++++++++ src/datajoint/__init__.py | 1 + src/datajoint/attribute_type.py | 125 ++++++++++++++ src/datajoint/fetch.py | 22 ++- src/datajoint/migrate.py | 249 +++++++++++++++++++++++++++ src/datajoint/table.py | 7 +- tests/test_attribute_type.py | 68 ++++++++ 7 files changed, 572 insertions(+), 14 deletions(-) create mode 100644 src/datajoint/migrate.py diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md index 43a168358..4299df24d 100644 --- a/docs/src/design/tables/customtype.md +++ b/docs/src/design/tables/customtype.md @@ -476,3 +476,117 @@ def test_graph_type_roundtrip(): assert set(g.edges) == set(decoded.edges) ``` + +## Built-in Types + +DataJoint includes a built-in type for explicit blob serialization: + +### `` - DataJoint Blob Serialization + +The `` type provides explicit control over DataJoint's native binary +serialization. It supports: + +- NumPy arrays (compatible with MATLAB) +- Python dicts, lists, tuples, sets +- datetime objects, Decimals, UUIDs +- Nested data structures +- Optional compression + +```python +@schema +class ProcessedData(dj.Manual): + definition = """ + data_id : int + --- + results : # Explicit serialization + raw_bytes : longblob # Backward-compatible (auto-serialized) + """ +``` + +#### When to Use `` + +- **New tables**: Prefer `` for clarity and future-proofing +- **Custom types**: Use `` when your type chains to blob storage +- **Migration**: Existing `longblob` columns can be migrated to `` + +#### Backward Compatibility + +For backward compatibility, `longblob` columns without an explicit type +still receive automatic serialization. The behavior is identical to ``, +but using `` makes the serialization explicit in your code. + +## Schema Migration + +When upgrading existing schemas to use explicit type declarations, DataJoint +provides migration utilities. 
+ +### Analyzing Blob Columns + +```python +import datajoint as dj + +schema = dj.schema("my_database") + +# Check migration status +status = dj.migrate.check_migration_status(schema) +print(f"Blob columns: {status['total_blob_columns']}") +print(f"Already migrated: {status['migrated']}") +print(f"Pending migration: {status['pending']}") +``` + +### Generating Migration SQL + +```python +# Preview migration (dry run) +result = dj.migrate.migrate_blob_columns(schema, dry_run=True) +for sql in result['sql_statements']: + print(sql) +``` + +### Applying Migration + +```python +# Apply migration +result = dj.migrate.migrate_blob_columns(schema, dry_run=False) +print(f"Migrated {result['migrated']} columns") +``` + +### Migration Details + +The migration updates MySQL column comments to include the type declaration. +This is a **metadata-only** change - the actual blob data format is unchanged. + +Before migration: +- Column: `longblob` +- Comment: `user comment` +- Behavior: Auto-serialization (implicit) + +After migration: +- Column: `longblob` +- Comment: `::user comment` +- Behavior: Explicit serialization via `` + +### Updating Table Definitions + +After database migration, update your Python table definitions for consistency: + +```python +# Before +class MyTable(dj.Manual): + definition = """ + id : int + --- + data : longblob # stored data + """ + +# After +class MyTable(dj.Manual): + definition = """ + id : int + --- + data : # stored data + """ +``` + +Both definitions work identically after migration, but using `` makes +the serialization explicit and documents the intended behavior. diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index feff400bf..0a8492cf1 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -58,6 +58,7 @@ ] from . import errors +from . 
import migrate from .admin import kill from .attribute_adapter import AttributeAdapter from .attribute_type import AttributeType, list_types, register_type diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py index 31393b2a9..d9a890a83 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/attribute_type.py @@ -153,6 +153,10 @@ def decode(self, stored: Any, *, key: dict | None = None) -> Any: """ ... + # Class attribute: If True, encode() produces final binary data (no blob.pack needed) + # Override in subclasses that handle their own serialization + serializes: bool = False + def validate(self, value: Any) -> None: """ Validate a value before encoding. @@ -409,3 +413,124 @@ def resolve_dtype(dtype: str, seen: set[str] | None = None) -> tuple[str, list[A # Not a custom type - return as-is return dtype, chain + + +# ============================================================================= +# Built-in Attribute Types +# ============================================================================= + + +class DJBlobType(AttributeType): + """ + Built-in type for DataJoint's native serialization format. + + This type handles serialization of arbitrary Python objects (including NumPy arrays, + dictionaries, lists, etc.) using DataJoint's binary blob format. The format includes: + + - Protocol headers (``mYm`` for MATLAB-compatible, ``dj0`` for Python-native) + - Optional compression (zlib) + - Support for NumPy arrays, datetime objects, UUIDs, and nested structures + + The ```` type is the explicit way to specify DataJoint's serialization. + It stores data in a MySQL ``LONGBLOB`` column. + + Example: + @schema + class ProcessedData(dj.Manual): + definition = ''' + data_id : int + --- + results : # Explicit DataJoint serialization + raw_bytes : longblob # Raw bytes (no serialization) + ''' + + Note: + For backward compatibility, ``longblob`` columns without an explicit type + still use automatic serialization. 
Use ```` to be explicit about + serialization behavior. + """ + + type_name = "djblob" + dtype = "longblob" + serializes = True # This type handles its own serialization + + def encode(self, value: Any, *, key: dict | None = None) -> bytes: + """ + Serialize a Python object to DataJoint's blob format. + + Args: + value: Any serializable Python object (dict, list, numpy array, etc.) + key: Primary key values (unused for blob serialization). + + Returns: + Serialized bytes with protocol header and optional compression. + """ + from . import blob + + return blob.pack(value, compress=True) + + def decode(self, stored: bytes, *, key: dict | None = None) -> Any: + """ + Deserialize DataJoint blob format back to a Python object. + + Args: + stored: Serialized blob bytes. + key: Primary key values (unused for blob serialization). + + Returns: + The deserialized Python object. + """ + from . import blob + + return blob.unpack(stored, squeeze=False) + + +class DJBlobExternalType(AttributeType): + """ + Built-in type for externally-stored DataJoint blobs. + + Similar to ```` but stores data in external blob storage instead + of inline in the database. Useful for large objects. + + The store name is specified when defining the column type. + + Example: + @schema + class LargeData(dj.Manual): + definition = ''' + data_id : int + --- + large_array : blob@mystore # External storage with auto-serialization + ''' + """ + + # Note: This type isn't directly usable via syntax + # It's used internally when blob@store syntax is detected + type_name = "djblob_external" + dtype = "blob@store" # Placeholder - actual store is determined at declaration time + serializes = True # This type handles its own serialization + + def encode(self, value: Any, *, key: dict | None = None) -> bytes: + """Serialize a Python object to DataJoint's blob format.""" + from . 
import blob + + return blob.pack(value, compress=True) + + def decode(self, stored: bytes, *, key: dict | None = None) -> Any: + """Deserialize DataJoint blob format back to a Python object.""" + from . import blob + + return blob.unpack(stored, squeeze=False) + + +def _register_builtin_types() -> None: + """ + Register DataJoint's built-in attribute types. + + Called automatically during module initialization. + """ + register_type(DJBlobType) + + +# Register built-in types when module is loaded +_register_builtin_types() diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index 0cac13632..4dfe42c12 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -88,18 +88,16 @@ def adapt(x): safe_write(local_filepath, data.split(b"\0", 1)[1]) return adapt(str(local_filepath)) # download file from remote store - return adapt( - uuid.UUID(bytes=data) - if attr.uuid - else ( - blob.unpack( - extern.get(uuid.UUID(bytes=data)) if attr.is_external else data, - squeeze=squeeze, - ) - if attr.is_blob - else data - ) - ) + if attr.uuid: + return adapt(uuid.UUID(bytes=data)) + elif attr.is_blob: + blob_data = extern.get(uuid.UUID(bytes=data)) if attr.is_external else data + # Skip unpack if adapter handles its own deserialization + if attr.adapter and getattr(attr.adapter, "serializes", False): + return attr.adapter.decode(blob_data, key=None) + return adapt(blob.unpack(blob_data, squeeze=squeeze)) + else: + return adapt(data) class Fetch: diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py new file mode 100644 index 000000000..e463da93a --- /dev/null +++ b/src/datajoint/migrate.py @@ -0,0 +1,249 @@ +""" +Migration utilities for DataJoint schema updates. + +This module provides tools for migrating existing schemas to use the new +AttributeType system, particularly for upgrading blob columns to use +explicit `<djblob>` type declarations.
+""" + +from __future__ import annotations + +import logging +import re +from typing import TYPE_CHECKING + +from .errors import DataJointError + +if TYPE_CHECKING: + from .connection import Connection + from .schemas import Schema + +logger = logging.getLogger(__name__.split(".")[0]) + +# Pattern to detect blob types +BLOB_TYPES = re.compile(r"^(tiny|small|medium|long|)blob$", re.I) + + +def analyze_blob_columns(schema: Schema) -> list[dict]: + """ + Analyze a schema to find blob columns that could be migrated to . + + This function identifies blob columns that: + 1. Have a MySQL blob type (tinyblob, blob, mediumblob, longblob) + 2. Do NOT already have an adapter/type specified in their comment + + Args: + schema: The DataJoint schema to analyze. + + Returns: + List of dicts with keys: + - table_name: Full table name (database.table) + - column_name: Name of the blob column + - column_type: MySQL column type + - current_comment: Current column comment + - needs_migration: True if column should be migrated + + Example: + >>> import datajoint as dj + >>> schema = dj.schema('my_database') + >>> columns = dj.migrate.analyze_blob_columns(schema) + >>> for col in columns: + ... if col['needs_migration']: + ... 
print(f"{col['table_name']}.{col['column_name']}") + """ + results = [] + + connection = schema.connection + + # Get all tables in the schema + tables_query = """ + SELECT TABLE_NAME + FROM information_schema.TABLES + WHERE TABLE_SCHEMA = %s + AND TABLE_TYPE = 'BASE TABLE' + AND TABLE_NAME NOT LIKE '~%%' + """ + + tables = connection.query(tables_query, args=(schema.database,)).fetchall() + + for (table_name,) in tables: + # Get column information for each table + columns_query = """ + SELECT COLUMN_NAME, COLUMN_TYPE, COLUMN_COMMENT + FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = %s + AND TABLE_NAME = %s + AND DATA_TYPE IN ('tinyblob', 'blob', 'mediumblob', 'longblob') + """ + + columns = connection.query(columns_query, args=(schema.database, table_name)).fetchall() + + for column_name, column_type, comment in columns: + # Check if comment already has an adapter type (starts with :type:) + has_adapter = comment and comment.startswith(":") + + results.append( + { + "table_name": f"{schema.database}.{table_name}", + "column_name": column_name, + "column_type": column_type, + "current_comment": comment or "", + "needs_migration": not has_adapter, + } + ) + + return results + + +def generate_migration_sql( + schema: Schema, + target_type: str = "djblob", + dry_run: bool = True, +) -> list[str]: + """ + Generate SQL statements to migrate blob columns to use . + + This generates ALTER TABLE statements that update column comments to + include the `::` prefix, marking them as using explicit + DataJoint blob serialization. + + Args: + schema: The DataJoint schema to migrate. + target_type: The type name to migrate to (default: "djblob"). + dry_run: If True, only return SQL without executing. + + Returns: + List of SQL ALTER TABLE statements. + + Example: + >>> sql_statements = dj.migrate.generate_migration_sql(schema) + >>> for sql in sql_statements: + ... print(sql) + + Note: + This is a metadata-only migration. 
The actual blob data format + remains unchanged - only the column comments are updated to + indicate explicit type handling. + """ + columns = analyze_blob_columns(schema) + sql_statements = [] + + for col in columns: + if not col["needs_migration"]: + continue + + # Build new comment with type prefix + old_comment = col["current_comment"] + new_comment = f":<{target_type}>:{old_comment}" + + # Escape special characters for SQL + new_comment_escaped = new_comment.replace("\\", "\\\\").replace("'", "\\'") + + # Parse table name + db_name, table_name = col["table_name"].split(".") + + # Generate ALTER TABLE statement + sql = ( + f"ALTER TABLE `{db_name}`.`{table_name}` " + f"MODIFY COLUMN `{col['column_name']}` {col['column_type']} " + f"COMMENT '{new_comment_escaped}'" + ) + sql_statements.append(sql) + + return sql_statements + + +def migrate_blob_columns( + schema: Schema, + target_type: str = "djblob", + dry_run: bool = True, +) -> dict: + """ + Migrate blob columns in a schema to use explicit type. + + This updates column comments in the database to include the type + declaration. The data format remains unchanged. + + Args: + schema: The DataJoint schema to migrate. + target_type: The type name to migrate to (default: "djblob"). + dry_run: If True, only preview changes without applying. 
+ + Returns: + Dict with keys: + - analyzed: Number of blob columns analyzed + - needs_migration: Number of columns that need migration + - migrated: Number of columns migrated (0 if dry_run) + - sql_statements: List of SQL statements (executed or to be executed) + + Example: + >>> # Preview migration + >>> result = dj.migrate.migrate_blob_columns(schema, dry_run=True) + >>> print(f"Would migrate {result['needs_migration']} columns") + + >>> # Apply migration + >>> result = dj.migrate.migrate_blob_columns(schema, dry_run=False) + >>> print(f"Migrated {result['migrated']} columns") + + Warning: + After migration, table definitions should be updated to use + `<djblob>` instead of `longblob` for consistency. The migration + only updates database metadata; source code changes are manual. + """ + columns = analyze_blob_columns(schema) + sql_statements = generate_migration_sql(schema, target_type=target_type) + + result = { + "analyzed": len(columns), + "needs_migration": sum(1 for c in columns if c["needs_migration"]), + "migrated": 0, + "sql_statements": sql_statements, + } + + if dry_run: + logger.info(f"Dry run: would migrate {result['needs_migration']} columns") + for sql in sql_statements: + logger.info(f" {sql}") + return result + + # Execute migrations + connection = schema.connection + for sql in sql_statements: + try: + connection.query(sql) + result["migrated"] += 1 + logger.info(f"Executed: {sql}") + except Exception as e: + logger.error(f"Failed to execute: {sql}\nError: {e}") + raise DataJointError(f"Migration failed: {e}") from e + + logger.info(f"Successfully migrated {result['migrated']} columns") + return result + + +def check_migration_status(schema: Schema) -> dict: + """ + Check the migration status of blob columns in a schema. + + Args: + schema: The DataJoint schema to check.
+ + Returns: + Dict with keys: + - total_blob_columns: Total number of blob columns + - migrated: Number of columns with explicit type + - pending: Number of columns using implicit serialization + - columns: List of column details + + Example: + >>> status = dj.migrate.check_migration_status(schema) + >>> print(f"Migration progress: {status['migrated']}/{status['total_blob_columns']}") + """ + columns = analyze_blob_columns(schema) + + return { + "total_blob_columns": len(columns), + "migrated": sum(1 for c in columns if not c["needs_migration"]), + "pending": sum(1 for c in columns if c["needs_migration"]), + "columns": columns, + } diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 20f579225..89050bce1 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -742,8 +742,11 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False): raise DataJointError("badly formed UUID value {v} for attribute `{n}`".format(v=value, n=name)) value = value.bytes elif attr.is_blob: - value = blob.pack(value) - value = self.external[attr.store].put(value).bytes if attr.is_external else value + # Skip blob.pack if adapter already handles serialization + if not (attr.adapter and getattr(attr.adapter, "serializes", False)): + value = blob.pack(value) + if attr.is_external: + value = self.external[attr.store].put(value).bytes elif attr.is_attachment: attachment_path = Path(value) if attr.is_external: diff --git a/tests/test_attribute_type.py b/tests/test_attribute_type.py index 294b7eee8..9fc7cd86f 100644 --- a/tests/test_attribute_type.py +++ b/tests/test_attribute_type.py @@ -345,3 +345,71 @@ def test_attribute_adapter_deprecated(self): assert hasattr(dj, "AttributeAdapter") # AttributeAdapter should be a subclass of AttributeType assert issubclass(dj.AttributeAdapter, dj.AttributeType) + + +class TestDJBlobType: + """Tests for the built-in DJBlobType.""" + + def test_djblob_is_registered(self): + """Test that djblob is automatically 
registered.""" + assert is_type_registered("djblob") + + def test_djblob_properties(self): + """Test DJBlobType properties.""" + blob_type = get_type("djblob") + assert blob_type.type_name == "djblob" + assert blob_type.dtype == "longblob" + assert blob_type.serializes is True + + def test_djblob_encode_decode_roundtrip(self): + """Test that encode/decode is a proper roundtrip.""" + import numpy as np + + blob_type = get_type("djblob") + + # Test with various data types + test_data = [ + {"key": "value", "number": 42}, + [1, 2, 3, 4, 5], + np.array([1.0, 2.0, 3.0]), + "simple string", + (1, 2, 3), + None, + ] + + for original in test_data: + encoded = blob_type.encode(original) + assert isinstance(encoded, bytes) + decoded = blob_type.decode(encoded) + if isinstance(original, np.ndarray): + np.testing.assert_array_equal(decoded, original) + else: + assert decoded == original + + def test_djblob_encode_produces_valid_blob_format(self): + """Test that encoded data has valid blob protocol header.""" + blob_type = get_type("djblob") + encoded = blob_type.encode({"test": "data"}) + + # Should start with compression prefix or protocol header + valid_prefixes = (b"ZL123\0", b"mYm\0", b"dj0\0") + assert any(encoded.startswith(p) for p in valid_prefixes) + + def test_djblob_in_list_types(self): + """Test that djblob appears in list_types.""" + types = list_types() + assert "djblob" in types + + def test_serializes_flag_prevents_double_pack(self): + """Test that serializes=True prevents blob.pack being called twice. + + This is a unit test for the flag itself. Integration test with tables + is in test_blob.py or test_adapted_attributes.py. 
+ """ + blob_type = get_type("djblob") + assert blob_type.serializes is True + + # Legacy adapters should not have serializes=True + # (they rely on blob.pack being called after encode) + # AttributeType base class defaults to False + assert AttributeType.serializes is False From c8d8a22d8251bc4730f48baa5036c16363201a3e Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 02:57:52 +0000 Subject: [PATCH 05/42] Clarify migration handles all blob type variants --- docs/src/design/tables/customtype.md | 6 ++++-- src/datajoint/migrate.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md index 4299df24d..4a8a9ae06 100644 --- a/docs/src/design/tables/customtype.md +++ b/docs/src/design/tables/customtype.md @@ -556,13 +556,15 @@ print(f"Migrated {result['migrated']} columns") The migration updates MySQL column comments to include the type declaration. This is a **metadata-only** change - the actual blob data format is unchanged. +All blob type variants are handled: `tinyblob`, `blob`, `mediumblob`, `longblob`. + Before migration: -- Column: `longblob` +- Column: `longblob` (or `blob`, `mediumblob`, etc.) - Comment: `user comment` - Behavior: Auto-serialization (implicit) After migration: -- Column: `longblob` +- Column: `longblob` (unchanged) - Comment: `::user comment` - Behavior: Explicit serialization via `` diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py index e463da93a..b7c707d3e 100644 --- a/src/datajoint/migrate.py +++ b/src/datajoint/migrate.py @@ -32,6 +32,8 @@ def analyze_blob_columns(schema: Schema) -> list[dict]: 1. Have a MySQL blob type (tinyblob, blob, mediumblob, longblob) 2. Do NOT already have an adapter/type specified in their comment + All blob size variants are included in the analysis. + Args: schema: The DataJoint schema to analyze. 
@@ -39,7 +41,7 @@ def analyze_blob_columns(schema: Schema) -> list[dict]: List of dicts with keys: - table_name: Full table name (database.table) - column_name: Name of the blob column - - column_type: MySQL column type + - column_type: MySQL column type (tinyblob, blob, mediumblob, longblob) - current_comment: Current column comment - needs_migration: True if column should be migrated @@ -49,7 +51,7 @@ def analyze_blob_columns(schema: Schema) -> list[dict]: >>> columns = dj.migrate.analyze_blob_columns(schema) >>> for col in columns: ... if col['needs_migration']: - ... print(f"{col['table_name']}.{col['column_name']}") + ... print(f"{col['table_name']}.{col['column_name']} ({col['column_type']})") """ results = [] From 61db015f5065862ea420b09b4c51518d86defa0c Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 03:03:17 +0000 Subject: [PATCH 06/42] Fix ruff linter errors: add migrate to __all__, remove unused import --- src/datajoint/__init__.py | 1 + src/datajoint/migrate.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index 0a8492cf1..ef9e59cb0 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -50,6 +50,7 @@ "list_types", "AttributeAdapter", # Deprecated, use AttributeType "errors", + "migrate", "DataJointError", "key", "key_hash", diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py index b7c707d3e..696ca380e 100644 --- a/src/datajoint/migrate.py +++ b/src/datajoint/migrate.py @@ -15,7 +15,6 @@ from .errors import DataJointError if TYPE_CHECKING: - from .connection import Connection from .schemas import Schema logger = logging.getLogger(__name__.split(".")[0]) From 78e0d1dc94fb0ba7ca70c9897e64a45158ce8030 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 03:22:20 +0000 Subject: [PATCH 07/42] Remove serializes flag; longblob is now raw bytes Simplified design: - Plain longblob columns store/return raw bytes (no serialization) - type 
handles serialization via encode/decode - Legacy AttributeAdapter handles blob pack/unpack internally for backward compatibility This eliminates the need for the serializes flag by making blob serialization the responsibility of the adapter/type, not the framework. Migration to is now required for existing schemas that rely on implicit serialization. --- docs/src/design/tables/customtype.md | 38 +++++++++++++++++++++------- src/datajoint/attribute_adapter.py | 34 ++++++++++++++++++++++--- src/datajoint/attribute_type.py | 15 ++++------- src/datajoint/fetch.py | 7 ++--- src/datajoint/table.py | 5 ++-- tests/test_attribute_type.py | 24 ++++++++++-------- 6 files changed, 85 insertions(+), 38 deletions(-) diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md index 4a8a9ae06..7504d5d23 100644 --- a/docs/src/design/tables/customtype.md +++ b/docs/src/design/tables/customtype.md @@ -498,22 +498,42 @@ class ProcessedData(dj.Manual): definition = """ data_id : int --- - results : # Explicit serialization - raw_bytes : longblob # Backward-compatible (auto-serialized) + results : # Serialized Python objects + raw_bytes : longblob # Raw bytes (no serialization) """ ``` #### When to Use `` -- **New tables**: Prefer `` for clarity and future-proofing -- **Custom types**: Use `` when your type chains to blob storage -- **Migration**: Existing `longblob` columns can be migrated to `` +- **Serialized data**: When storing Python objects (dicts, arrays, etc.) +- **New tables**: Prefer `` for automatic serialization +- **Migration**: Existing schemas with implicit serialization must migrate -#### Backward Compatibility +#### Raw Blob Behavior -For backward compatibility, `longblob` columns without an explicit type -still receive automatic serialization. The behavior is identical to ``, -but using `` makes the serialization explicit in your code. 
+Plain `longblob` (and other blob variants) columns now store and return +**raw bytes** without automatic serialization: + +```python +@schema +class RawData(dj.Manual): + definition = """ + id : int + --- + raw_bytes : longblob # Stores/returns raw bytes + serialized : <djblob> # Stores Python objects with serialization + """ + +# Raw bytes - no serialization +RawData.insert1({"id": 1, "raw_bytes": b"raw binary data", "serialized": {"key": "value"}}) + +row = (RawData & "id=1").fetch1() +row["raw_bytes"] # Returns: b"raw binary data" +row["serialized"] # Returns: {"key": "value"} +``` + +**Important**: Existing schemas that relied on implicit blob serialization +must be migrated to `<djblob>` to preserve their behavior. ## Schema Migration diff --git a/src/datajoint/attribute_adapter.py b/src/datajoint/attribute_adapter.py index 7e49abb5c..7df566a58 100644 --- a/src/datajoint/attribute_adapter.py +++ b/src/datajoint/attribute_adapter.py @@ -15,6 +15,9 @@ from .attribute_type import AttributeType, get_type, is_type_registered from .errors import DataJointError +# Pattern to detect blob types for internal pack/unpack +_BLOB_PATTERN = re.compile(r"^(tiny|small|medium|long|)blob", re.I) + class AttributeAdapter(AttributeType): """ @@ -87,12 +90,37 @@ def dtype(self) -> str: ) return attr_type + def _is_blob_dtype(self) -> bool: + """Check if dtype is a blob type requiring pack/unpack.""" + return bool(_BLOB_PATTERN.match(self.dtype)) + def encode(self, value: Any, *, key: dict | None = None) -> Any: - """Delegate to legacy put() method.""" - return self.put(value) + """ + Delegate to legacy put() method, with blob packing if needed. + + Legacy adapters expect blob.pack to be called after put() when + the dtype is a blob type. This wrapper handles that automatically. + """ + result = self.put(value) + # Legacy adapters expect blob.pack after put() for blob dtypes + if self._is_blob_dtype(): + from .
import blob + + result = blob.pack(result) + return result def decode(self, stored: Any, *, key: dict | None = None) -> Any: - """Delegate to legacy get() method.""" + """ + Delegate to legacy get() method, with blob unpacking if needed. + + Legacy adapters expect blob.unpack to be called before get() when + the dtype is a blob type. This wrapper handles that automatically. + """ + # Legacy adapters expect blob.unpack before get() for blob dtypes + if self._is_blob_dtype(): + from . import blob + + stored = blob.unpack(stored) return self.get(stored) def put(self, obj: Any) -> Any: diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py index d9a890a83..9be2d2214 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/attribute_type.py @@ -153,10 +153,6 @@ def decode(self, stored: Any, *, key: dict | None = None) -> Any: """ ... - # Class attribute: If True, encode() produces final binary data (no blob.pack needed) - # Override in subclasses that handle their own serialization - serializes: bool = False - def validate(self, value: Any) -> None: """ Validate a value before encoding. @@ -440,19 +436,19 @@ class ProcessedData(dj.Manual): definition = ''' data_id : int --- - results : # Explicit DataJoint serialization + results : # Serialized Python objects raw_bytes : longblob # Raw bytes (no serialization) ''' Note: - For backward compatibility, ``longblob`` columns without an explicit type - still use automatic serialization. Use ```` to be explicit about - serialization behavior. + Plain ``longblob`` columns store and return raw bytes without serialization. + Use ```` when you need automatic serialization of Python objects. + Existing schemas using implicit blob serialization should migrate to ```` + using ``dj.migrate.migrate_blob_columns()``. 
""" type_name = "djblob" dtype = "longblob" - serializes = True # This type handles its own serialization def encode(self, value: Any, *, key: dict | None = None) -> bytes: """ @@ -508,7 +504,6 @@ class LargeData(dj.Manual): # It's used internally when blob@store syntax is detected type_name = "djblob_external" dtype = "blob@store" # Placeholder - actual store is determined at declaration time - serializes = True # This type handles its own serialization def encode(self, value: Any, *, key: dict | None = None) -> bytes: """Serialize a Python object to DataJoint's blob format.""" diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index 4dfe42c12..73057938d 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -92,10 +92,11 @@ def adapt(x): return adapt(uuid.UUID(bytes=data)) elif attr.is_blob: blob_data = extern.get(uuid.UUID(bytes=data)) if attr.is_external else data - # Skip unpack if adapter handles its own deserialization - if attr.adapter and getattr(attr.adapter, "serializes", False): + # Adapters (like ) handle deserialization in decode() + # Without adapter, blob columns return raw bytes (no deserialization) + if attr.adapter: return attr.adapter.decode(blob_data, key=None) - return adapt(blob.unpack(blob_data, squeeze=squeeze)) + return blob_data # raw bytes else: return adapt(data) diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 89050bce1..52ad32e71 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -742,9 +742,8 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False): raise DataJointError("badly formed UUID value {v} for attribute `{n}`".format(v=value, n=name)) value = value.bytes elif attr.is_blob: - # Skip blob.pack if adapter already handles serialization - if not (attr.adapter and getattr(attr.adapter, "serializes", False)): - value = blob.pack(value) + # Adapters (like ) handle serialization in encode() + # Without adapter, blob columns store raw bytes (no serialization) if 
attr.is_external: value = self.external[attr.store].put(value).bytes elif attr.is_attachment: diff --git a/tests/test_attribute_type.py b/tests/test_attribute_type.py index 9fc7cd86f..f8f822a60 100644 --- a/tests/test_attribute_type.py +++ b/tests/test_attribute_type.py @@ -359,7 +359,6 @@ def test_djblob_properties(self): blob_type = get_type("djblob") assert blob_type.type_name == "djblob" assert blob_type.dtype == "longblob" - assert blob_type.serializes is True def test_djblob_encode_decode_roundtrip(self): """Test that encode/decode is a proper roundtrip.""" @@ -400,16 +399,21 @@ def test_djblob_in_list_types(self): types = list_types() assert "djblob" in types - def test_serializes_flag_prevents_double_pack(self): - """Test that serializes=True prevents blob.pack being called twice. + def test_djblob_handles_serialization(self): + """Test that DJBlobType handles serialization internally. - This is a unit test for the flag itself. Integration test with tables - is in test_blob.py or test_adapted_attributes.py. 
+ With the new design: + - Plain longblob columns store/return raw bytes (no serialization) + - <djblob> handles pack/unpack in encode/decode + - Legacy AttributeAdapter handles pack/unpack internally for backward compat """ blob_type = get_type("djblob") - assert blob_type.serializes is True - # Legacy adapters should not have serializes=True - # (they rely on blob.pack being called after encode) - # AttributeType base class defaults to False - assert AttributeType.serializes is False + # DJBlobType.encode() should produce packed bytes + data = {"key": "value"} + encoded = blob_type.encode(data) + assert isinstance(encoded, bytes) + + # DJBlobType.decode() should unpack back to original + decoded = blob_type.decode(encoded) + assert decoded == data From c17335674e0041b1ff5b6397e0d87f7d11eef18d Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 03:25:36 +0000 Subject: [PATCH 08/42] Remove unused blob imports from fetch.py and table.py --- src/datajoint/fetch.py | 2 +- src/datajoint/table.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index 73057938d..147e70b7b 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -10,7 +10,7 @@ from datajoint.condition import Top -from . import blob, hash +from . import hash from .errors import DataJointError from .settings import config from .utils import safe_write diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 52ad32e71..f3722fdde 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -12,7 +12,6 @@ import numpy as np import pandas -from .
import blob from .condition import make_condition from .declare import alter, declare from .errors import ( From 106f859280e956a41bdf5abc04beeb4298d2f979 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 03:29:42 +0000 Subject: [PATCH 09/42] Update docs: use for serialized data, longblob for raw bytes --- docs/src/compute/key-source.md | 2 +- docs/src/compute/make.md | 4 ++-- docs/src/compute/populate.md | 6 +++--- docs/src/design/integrity.md | 2 +- docs/src/design/tables/attributes.md | 9 +++++++-- docs/src/design/tables/customtype.md | 2 +- docs/src/design/tables/master-part.md | 6 +++--- 7 files changed, 18 insertions(+), 13 deletions(-) diff --git a/docs/src/compute/key-source.md b/docs/src/compute/key-source.md index 76796ec0c..c9b5d2ce7 100644 --- a/docs/src/compute/key-source.md +++ b/docs/src/compute/key-source.md @@ -45,7 +45,7 @@ definition = """ -> Recording --- sample_rate : float -eeg_data : longblob +eeg_data : """ key_source = Recording & 'recording_type = "EEG"' ``` diff --git a/docs/src/compute/make.md b/docs/src/compute/make.md index 1b5569b65..390be3b7b 100644 --- a/docs/src/compute/make.md +++ b/docs/src/compute/make.md @@ -152,7 +152,7 @@ class ImageAnalysis(dj.Computed): # Complex image analysis results -> Image --- - analysis_result : longblob + analysis_result : processing_time : float """ @@ -188,7 +188,7 @@ class ImageAnalysis(dj.Computed): # Complex image analysis results -> Image --- - analysis_result : longblob + analysis_result : processing_time : float """ diff --git a/docs/src/compute/populate.md b/docs/src/compute/populate.md index 45c863f17..91db7b176 100644 --- a/docs/src/compute/populate.md +++ b/docs/src/compute/populate.md @@ -40,7 +40,7 @@ class FilteredImage(dj.Computed): # Filtered image -> Image --- - filtered_image : longblob + filtered_image : """ def make(self, key): @@ -196,7 +196,7 @@ class ImageAnalysis(dj.Computed): # Complex image analysis results -> Image --- - analysis_result : longblob + 
analysis_result : processing_time : float """ @@ -230,7 +230,7 @@ class ImageAnalysis(dj.Computed): # Complex image analysis results -> Image --- - analysis_result : longblob + analysis_result : processing_time : float """ diff --git a/docs/src/design/integrity.md b/docs/src/design/integrity.md index cb7122755..393103522 100644 --- a/docs/src/design/integrity.md +++ b/docs/src/design/integrity.md @@ -142,7 +142,7 @@ definition = """ -> EEGRecording channel_idx : int --- -channel_data : longblob +channel_data : """ ``` ![doc_1-many](../images/doc_1-many.png){: style="align:center"} diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index 4f8a0644e..c849e85ba 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -48,9 +48,10 @@ fractional digits. Because of its well-defined precision, `decimal` values can be used in equality comparison and be included in primary keys. -- `longblob`: arbitrary numeric array (e.g. matrix, image, structure), up to 4 +- `longblob`: raw binary data, up to 4 [GiB](http://en.wikipedia.org/wiki/Gibibyte) in size. - Numeric arrays are compatible between MATLAB and Python (NumPy). + Stores and returns raw bytes without serialization. + For serialized Python objects (arrays, dicts, etc.), use `` instead. The `longblob` and other `blob` datatypes can be configured to store data [externally](../../sysadmin/external-store.md) by using the `blob@store` syntax. @@ -71,6 +72,10 @@ info). These types abstract certain kinds of non-database data to facilitate use together with DataJoint. +- ``: DataJoint's native serialization format for Python objects. Supports +NumPy arrays, dicts, lists, datetime objects, and nested structures. Compatible with +MATLAB. See [custom types](customtype.md) for details. + - `attach`: a [file attachment](attach.md) similar to email attachments facillitating sending/receiving an opaque data file to/from a DataJoint pipeline. 
diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md index 7504d5d23..267e0420b 100644 --- a/docs/src/design/tables/customtype.md +++ b/docs/src/design/tables/customtype.md @@ -454,7 +454,7 @@ schema = dj.schema("mydb") 1. **Choose descriptive type names**: Use lowercase with underscores (e.g., `spike_train`, `graph_embedding`) -2. **Select appropriate storage types**: Use `longblob` for complex objects, `json` for simple structures, external storage for large data +2. **Select appropriate storage types**: Use `<djblob>` for complex objects, `json` for simple structures, external storage for large data 3. **Add validation**: Use `validate()` to catch data errors early diff --git a/docs/src/design/tables/master-part.md b/docs/src/design/tables/master-part.md index 629bfb8ab..d0f575e4d 100644 --- a/docs/src/design/tables/master-part.md +++ b/docs/src/design/tables/master-part.md @@ -26,8 +26,8 @@ class Segmentation(dj.Computed): -> Segmentation roi : smallint # roi number --- - roi_pixels : longblob # indices of pixels - roi_weights : longblob # weights of pixels + roi_pixels : <djblob> # indices of pixels + roi_weights : <djblob> # weights of pixels """ def make(self, key): @@ -101,7 +101,7 @@ definition = """ -> ElectrodeResponse channel: int --- -response: longblob # response of a channel +response: <djblob> # response of a channel """ ``` From cab10f69af8ed9df314ce7d2acdd4a3d2f59c59d Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 05:01:22 +0000 Subject: [PATCH 10/42] Add storage types redesign spec Design document for reimplementing blob, attach, filepath, and object types as a coherent AttributeType system. Separates storage location (@store) from encoding behavior.
--- docs/src/design/tables/storage-types-spec.md | 363 +++++++++++++++++++ 1 file changed, 363 insertions(+) create mode 100644 docs/src/design/tables/storage-types-spec.md diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md new file mode 100644 index 000000000..2247164d2 --- /dev/null +++ b/docs/src/design/tables/storage-types-spec.md @@ -0,0 +1,363 @@ +# Storage Types Redesign Spec + +## Overview + +This document proposes a redesign of DataJoint's storage types (`blob`, `attach`, `filepath`, `object`) as a coherent system built on the `AttributeType` base class. + +## Current State Analysis + +### Existing Types + +| Type | DB Column | Storage | Semantics | +|------|-----------|---------|-----------| +| `longblob` | LONGBLOB | Internal | Raw bytes | +| `blob@store` | binary(16) | External | Raw bytes via UUID | +| `attach` | LONGBLOB | Internal | `filename\0contents` | +| `attach@store` | binary(16) | External | File via UUID | +| `filepath@store` | binary(16) | External | Path-addressed file reference | +| `object` | JSON | External | Managed file/folder with ObjectRef | + +### Problems with Current Design + +1. **Scattered implementation**: Logic split across `declare.py`, `table.py`, `fetch.py`, `external.py` +2. **Inconsistent patterns**: Some types use AttributeType, others are hardcoded +3. **Implicit behaviors**: `longblob` previously auto-serialized, now raw +4. **Overlapping semantics**: `blob@store` vs `attach@store` unclear +5. **No internal object type**: `object` always requires external store + +## Proposed Architecture + +### Core Concepts + +1. **Storage Location** (orthogonal to type): + - **Internal**: Data stored directly in database column + - **External**: Data stored in external storage, UUID reference in database + +2. 
**Content Model** (what the type represents): + - **Binary**: Raw bytes (no interpretation) + - **Serialized**: Python objects encoded via DJ blob format + - **File**: Single file with filename metadata + - **Folder**: Directory structure + - **Reference**: Pointer to externally-managed file (path-addressed) + +3. **AttributeType** handles encoding/decoding between Python values and stored representation + +### Type Hierarchy + +``` + AttributeType (base) + │ + ┌─────────────────┼─────────────────┐ + │ │ │ + BinaryType SerializedType FileSystemType + (passthrough) (pack/unpack) │ + │ │ ┌──────┴──────┐ + │ │ │ │ + longblob + longblob@store filepath@store +``` + +### Proposed Types + +#### 1. Raw Binary (`longblob`, `blob`, etc.) + +**Not an AttributeType** - these are primitive MySQL types. + +- Store/return raw bytes without transformation +- `@store` variant stores externally with content-addressed UUID +- No encoding/decoding needed + +```python +# Table definition +class RawData(dj.Manual): + definition = """ + id : int + --- + data : longblob # raw bytes in DB + large_data : blob@store # raw bytes externally + """ + +# Usage +table.insert1({'id': 1, 'data': b'raw bytes', 'large_data': b'large raw bytes'}) +row = (table & 'id=1').fetch1() +assert row['data'] == b'raw bytes' # bytes returned +``` + +#### 2. Serialized Objects (``) + +**AttributeType** with DJ blob serialization. + +- Input: Any Python object (arrays, dicts, lists, etc.) 
+- Output: Same Python object reconstructed +- Storage: DJ blob format (mYm/dj0 protocol) + +```python +@dj.register_type +class DJBlobType(AttributeType): + type_name = "djblob" + dtype = "longblob" # or "longblob@store" for external + + def encode(self, value, *, key=None) -> bytes: + return blob.pack(value, compress=True) + + def decode(self, stored, *, key=None) -> Any: + return blob.unpack(stored) +``` + +```python +# Table definition +class ProcessedData(dj.Manual): + definition = """ + id : int + --- + result : # serialized in DB + large_result : # serialized externally + """ + +# Usage +table.insert1({'id': 1, 'result': {'array': np.array([1,2,3]), 'meta': 'info'}}) +row = (table & 'id=1').fetch1() +assert row['result']['meta'] == 'info' # Python dict returned +``` + +#### 3. File Attachments (``) + +**AttributeType** for file storage with filename preservation. + +- Input: File path (string or Path) +- Output: Local file path after download +- Storage: File contents with filename metadata + +```python +@dj.register_type +class AttachType(AttributeType): + type_name = "attach" + dtype = "longblob" # or "longblob@store" for external + + # For internal storage + def encode(self, filepath, *, key=None) -> bytes: + path = Path(filepath) + return path.name.encode() + b"\0" + path.read_bytes() + + def decode(self, stored, *, key=None) -> str: + filename, contents = stored.split(b"\0", 1) + # Download to configured path, return local filepath + ... +``` + +**Key difference from blob**: Preserves original filename, returns file path not bytes. + +```python +# Table definition +class Attachments(dj.Manual): + definition = """ + id : int + --- + config_file : # small file in DB + data_file : # large file externally + """ + +# Usage +table.insert1({'id': 1, 'config_file': '/path/to/config.yaml'}) +row = (table & 'id=1').fetch1() +# row['config_file'] == '/downloads/config.yaml' # local path +``` + +#### 4. 
Filepath References (``) + +**AttributeType** for tracking externally-managed files. + +- Input: File path in staging area +- Output: Local file path after sync +- Storage: Path-addressed (UUID = hash of relative path, not contents) +- Tracks `contents_hash` separately for verification + +```python +@dj.register_type +class FilepathType(AttributeType): + type_name = "filepath" + dtype = "binary(16)" # Always external (UUID reference) + requires_store = True # Must specify @store + + def encode(self, filepath, *, key=None) -> bytes: + # Compute UUID from relative path + # Track contents_hash separately + ... + + def decode(self, uuid_bytes, *, key=None) -> str: + # Sync file from remote to local stage + # Verify contents_hash + # Return local path + ... +``` + +**Key difference from attach**: +- Path-addressed (same path = same UUID, even if contents change) +- Designed for managed file workflows where files may be updated +- Always external (no internal variant) + +```python +# Table definition +class ManagedFiles(dj.Manual): + definition = """ + id : int + --- + data_path : + """ + +# Usage - file must be in configured stage directory +table.insert1({'id': 1, 'data_path': '/stage/experiment_001/data.h5'}) +row = (table & 'id=1').fetch1() +# row['data_path'] == '/local_stage/experiment_001/data.h5' +``` + +#### 5. Managed Objects (``) + +**AttributeType** for managed file/folder storage with lazy access. + +- Input: File path, folder path, or ObjectRef +- Output: ObjectRef handle (lazy - no automatic download) +- Storage: JSON metadata column +- Supports direct writes (Zarr, HDF5) via fsspec + +```python +@dj.register_type +class ObjectType(AttributeType): + type_name = "object" + dtype = "json" + requires_store = True # Must specify @store + + def encode(self, value, *, key=None) -> str: + # Upload file/folder to object storage + # Return JSON metadata + ... + + def decode(self, json_str, *, key=None) -> ObjectRef: + # Return ObjectRef handle (no download) + ... 
+``` + +```python +# Table definition +class LargeData(dj.Manual): + definition = """ + id : int + --- + zarr_data : + """ + +# Usage +table.insert1({'id': 1, 'zarr_data': '/path/to/data.zarr'}) +row = (table & 'id=1').fetch1() +ref = row['zarr_data'] # ObjectRef handle +ref.download('/local/path') # Explicit download +# Or direct access via fsspec +``` + +### Storage Location Modifier (`@store`) + +The `@store` suffix is orthogonal to the type and specifies external storage: + +| Type | Without @store | With @store | +|------|---------------|-------------| +| `longblob` | Raw bytes in DB | Raw bytes in external store | +| `` | Serialized in DB | Serialized in external store | +| `` | File in DB | File in external store | +| `` | N/A (error) | Path reference in external store | +| `` | N/A (error) | Object in external store | + +Implementation: +- `@store` changes the underlying `dtype` to `binary(16)` (UUID) +- Creates FK relationship to `~external_{store}` tracking table +- AttributeType's `encode()`/`decode()` work with the external table transparently + +### Extended AttributeType Interface + +For types that interact with the filesystem, we extend the base interface: + +```python +class FileSystemType(AttributeType): + """Base for types that work with file paths.""" + + # Standard interface + def encode(self, value, *, key=None) -> bytes | str: + """Convert input (path or value) to stored representation.""" + ... + + def decode(self, stored, *, key=None) -> str: + """Convert stored representation to local file path.""" + ... + + # Extended interface for external storage + def upload(self, filepath: Path, external: ExternalTable) -> uuid.UUID: + """Upload file to external storage, return UUID.""" + ... + + def download(self, uuid: uuid.UUID, external: ExternalTable, + download_path: Path) -> Path: + """Download from external storage to local path.""" + ... 
+``` + +### Configuration + +```python +# datajoint config +dj.config['stores'] = { + 'main': { + 'protocol': 's3', + 'endpoint': 's3.amazonaws.com', + 'bucket': 'my-bucket', + 'location': 'datajoint/', + }, + 'archive': { + 'protocol': 'file', + 'location': '/mnt/archive/', + } +} + +dj.config['download_path'] = '/tmp/dj_downloads' # For attach +dj.config['stage'] = '/data/stage' # For filepath +``` + +## Migration Path + +### Phase 1: Current State (Done) +- `` AttributeType implemented +- `longblob` returns raw bytes +- Legacy `AttributeAdapter` wrapped for backward compat + +### Phase 2: Attach as AttributeType +- Implement `` and `` as AttributeType +- Deprecate bare `attach` type (still works, emits warning) +- Move logic from table.py/fetch.py to AttachType class + +### Phase 3: Filepath as AttributeType +- Implement `` as AttributeType +- Deprecate `filepath@store` syntax (redirect to ``) + +### Phase 4: Object Type Refinement +- Already implemented as separate system +- Ensure consistent with AttributeType patterns +- Consider `` syntax + +### Phase 5: Cleanup +- Remove scattered type handling from table.py, fetch.py +- Consolidate external storage logic +- Update documentation + +## Summary + +| Type | Input | Output | Internal | External | Use Case | +|------|-------|--------|----------|----------|----------| +| `longblob` | bytes | bytes | ✓ | ✓ | Raw binary data | +| `` | any | any | ✓ | ✓ | Python objects, arrays | +| `` | path | path | ✓ | ✓ | Files with filename | +| `` | path | path | ✗ | ✓ | Managed file workflows | +| `` | path/ref | ObjectRef | ✗ | ✓ | Large files, Zarr, HDF5 | + +This design: +1. Makes all custom types consistent AttributeTypes +2. Separates storage location (`@store`) from encoding behavior +3. Provides clear semantics for each type +4. 
Enables gradual migration from current implementation From ecac82de457af638d648465b0c6b4948e6fd9e9f Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 05:20:48 +0000 Subject: [PATCH 11/42] Update storage types spec with OAS integration approach - Clarify OAS (object type) as distinct system - Propose storing blob@store/attach@store in OAS _external/ folder - Content-addressed deduplication via hash stored in varchar(64) - Propose to replace filepath@store - Add open questions and implementation phases Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 495 +++++++++---------- 1 file changed, 235 insertions(+), 260 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 2247164d2..79627a990 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -2,100 +2,56 @@ ## Overview -This document proposes a redesign of DataJoint's storage types (`blob`, `attach`, `filepath`, `object`) as a coherent system built on the `AttributeType` base class. +This document proposes a redesign of DataJoint's storage types as AttributeTypes, with clear separation between: -## Current State Analysis +1. **Object-Augmented Schemas (OAS)** - New paradigm with managed stores, integrity constraints, and prescribed organization +2. **Legacy External Storage** - Content-addressed blob/attach storage with deduplication +3. 
**Internal Blob Types** - AttributeTypes that serialize into database blob columns -### Existing Types +## Type Categories -| Type | DB Column | Storage | Semantics | -|------|-----------|---------|-----------| -| `longblob` | LONGBLOB | Internal | Raw bytes | -| `blob@store` | binary(16) | External | Raw bytes via UUID | -| `attach` | LONGBLOB | Internal | `filename\0contents` | -| `attach@store` | binary(16) | External | File via UUID | -| `filepath@store` | binary(16) | External | Path-addressed file reference | -| `object` | JSON | External | Managed file/folder with ObjectRef | +### 1. Object-Augmented Schemas (`object`, `object@store`) -### Problems with Current Design +**Already implemented.** A distinct system where stores are treated as part of the database: -1. **Scattered implementation**: Logic split across `declare.py`, `table.py`, `fetch.py`, `external.py` -2. **Inconsistent patterns**: Some types use AttributeType, others are hardcoded -3. **Implicit behaviors**: `longblob` previously auto-serialized, now raw -4. **Overlapping semantics**: `blob@store` vs `attach@store` unclear -5. **No internal object type**: `object` always requires external store - -## Proposed Architecture - -### Core Concepts - -1. **Storage Location** (orthogonal to type): - - **Internal**: Data stored directly in database column - - **External**: Data stored in external storage, UUID reference in database - -2. **Content Model** (what the type represents): - - **Binary**: Raw bytes (no interpretation) - - **Serialized**: Python objects encoded via DJ blob format - - **File**: Single file with filename metadata - - **Folder**: Directory structure - - **Reference**: Pointer to externally-managed file (path-addressed) - -3. 
**AttributeType** handles encoding/decoding between Python values and stored representation - -### Type Hierarchy - -``` - AttributeType (base) - │ - ┌─────────────────┼─────────────────┐ - │ │ │ - BinaryType SerializedType FileSystemType - (passthrough) (pack/unpack) │ - │ │ ┌──────┴──────┐ - │ │ │ │ - longblob - longblob@store filepath@store -``` - -### Proposed Types - -#### 1. Raw Binary (`longblob`, `blob`, etc.) - -**Not an AttributeType** - these are primitive MySQL types. - -- Store/return raw bytes without transformation -- `@store` variant stores externally with content-addressed UUID -- No encoding/decoding needed +- Robust integrity constraints +- Prescribed path organization (derived from primary key) +- Multiple store support via config +- Returns `ObjectRef` for lazy access +- Supports direct writes (Zarr, HDF5) via fsspec ```python # Table definition -class RawData(dj.Manual): +class Analysis(dj.Computed): definition = """ - id : int + -> Recording --- - data : longblob # raw bytes in DB - large_data : blob@store # raw bytes externally + results : object@main # stored in 'main' OAS store """ # Usage -table.insert1({'id': 1, 'data': b'raw bytes', 'large_data': b'large raw bytes'}) -row = (table & 'id=1').fetch1() -assert row['data'] == b'raw bytes' # bytes returned +row = (Analysis & key).fetch1() +ref = row['results'] # ObjectRef handle (lazy) +ref.download('/local/path') # explicit download +data = ref.open() # fsspec access ``` -#### 2. Serialized Objects (``) +**This type is NOT part of the AttributeType redesign** - it has its own implementation path. + +--- -**AttributeType** with DJ blob serialization. +### 2. Serialized Blobs (``) + +**Already implemented.** AttributeType for Python object serialization. - Input: Any Python object (arrays, dicts, lists, etc.) 
- Output: Same Python object reconstructed -- Storage: DJ blob format (mYm/dj0 protocol) +- Storage: DJ blob format (mYm/dj0 protocol) in LONGBLOB column ```python -@dj.register_type class DJBlobType(AttributeType): type_name = "djblob" - dtype = "longblob" # or "longblob@store" for external + dtype = "longblob" def encode(self, value, *, key=None) -> bytes: return blob.pack(value, compress=True) @@ -104,260 +60,279 @@ class DJBlobType(AttributeType): return blob.unpack(stored) ``` -```python -# Table definition -class ProcessedData(dj.Manual): - definition = """ - id : int - --- - result : # serialized in DB - large_result : # serialized externally - """ - -# Usage -table.insert1({'id': 1, 'result': {'array': np.array([1,2,3]), 'meta': 'info'}}) -row = (table & 'id=1').fetch1() -assert row['result']['meta'] == 'info' # Python dict returned -``` +--- -#### 3. File Attachments (``) +### 3. File Attachments (``) - TO IMPLEMENT -**AttributeType** for file storage with filename preservation. +AttributeType for serializing files into internal blob columns. - Input: File path (string or Path) -- Output: Local file path after download -- Storage: File contents with filename metadata +- Output: Local file path after extraction +- Storage: `filename\0contents` in LONGBLOB column ```python @dj.register_type class AttachType(AttributeType): type_name = "attach" - dtype = "longblob" # or "longblob@store" for external + dtype = "longblob" - # For internal storage def encode(self, filepath, *, key=None) -> bytes: path = Path(filepath) return path.name.encode() + b"\0" + path.read_bytes() def decode(self, stored, *, key=None) -> str: filename, contents = stored.split(b"\0", 1) - # Download to configured path, return local filepath - ... 
+ download_path = Path(dj.config['download_path']) / filename + download_path.parent.mkdir(parents=True, exist_ok=True) + download_path.write_bytes(contents) + return str(download_path) ``` -**Key difference from blob**: Preserves original filename, returns file path not bytes. - +**Usage:** ```python -# Table definition -class Attachments(dj.Manual): +class Configs(dj.Manual): definition = """ - id : int + config_id : int --- - config_file : # small file in DB - data_file : # large file externally + config_file : # file serialized into DB """ -# Usage -table.insert1({'id': 1, 'config_file': '/path/to/config.yaml'}) -row = (table & 'id=1').fetch1() -# row['config_file'] == '/downloads/config.yaml' # local path +# Insert +table.insert1({'config_id': 1, 'config_file': '/path/to/config.yaml'}) + +# Fetch - file extracted to download_path +row = (table & 'config_id=1').fetch1() +local_path = row['config_file'] # '/downloads/config.yaml' ``` -#### 4. Filepath References (``) +--- + +### 4. External Content-Addressed Storage (``, ``) - TO DESIGN + +These types store content externally with deduplication via content hashing. -**AttributeType** for tracking externally-managed files. +#### Design Option A: Leverage OAS Stores -- Input: File path in staging area -- Output: Local file path after sync -- Storage: Path-addressed (UUID = hash of relative path, not contents) -- Tracks `contents_hash` separately for verification +Store content-addressed blobs within OAS stores under a reserved folder: + +``` +store_root/ +├── _external/ # Reserved for content-addressed storage +│ ├── blobs/ # For +│ │ └── ab/cd/abcd1234... 
# Path derived from content hash +│ └── attach/ # For +│ └── ef/gh/efgh5678.../filename.ext +└── schema_name/ # Normal OAS paths + └── table_name/ + └── pk_value/ +``` + +**Advantages:** +- Reuses OAS infrastructure (fsspec, store config) +- DataJoint fully controls paths +- Deduplication via content hash +- No separate `~external_*` tracking tables needed + +**Implementation:** ```python +class ContentAddressedType(AttributeType): + """Base class for content-addressed external storage.""" + + subfolder: str # 'blobs' or 'attach' + + def _content_hash(self, data: bytes) -> str: + """Compute content hash for deduplication.""" + return hashlib.sha256(data).hexdigest() + + def _store_path(self, content_hash: str) -> str: + """Generate path within _external folder.""" + return f"_external/{self.subfolder}/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + + @dj.register_type -class FilepathType(AttributeType): - type_name = "filepath" - dtype = "binary(16)" # Always external (UUID reference) - requires_store = True # Must specify @store +class DJBlobExternalType(ContentAddressedType): + type_name = "djblob" # Same name, different dtype triggers external + dtype = "varchar(64)" # Store content hash as string + subfolder = "blobs" + + def encode(self, value, *, key=None, store=None) -> str: + data = blob.pack(value, compress=True) + content_hash = self._content_hash(data) + path = self._store_path(content_hash) + # Upload to store if not exists (deduplication) + store.put_if_absent(path, data) + return content_hash + + def decode(self, content_hash, *, key=None, store=None) -> Any: + path = self._store_path(content_hash) + data = store.get(path) + return blob.unpack(data) - def encode(self, filepath, *, key=None) -> bytes: - # Compute UUID from relative path - # Track contents_hash separately - ... 
- def decode(self, uuid_bytes, *, key=None) -> str: - # Sync file from remote to local stage - # Verify contents_hash - # Return local path +@dj.register_type +class AttachExternalType(ContentAddressedType): + type_name = "attach" + dtype = "varchar(64)" + subfolder = "attach" + + def encode(self, filepath, *, key=None, store=None) -> str: + path = Path(filepath) + # Hash includes filename for uniqueness + data = path.name.encode() + b"\0" + path.read_bytes() + content_hash = self._content_hash(data) + store_path = self._store_path(content_hash) + "/" + path.name + store.put_if_absent(store_path, path.read_bytes()) + return content_hash + + def decode(self, content_hash, *, key=None, store=None) -> str: + # List files in hash folder to get filename ... ``` -**Key difference from attach**: -- Path-addressed (same path = same UUID, even if contents change) -- Designed for managed file workflows where files may be updated -- Always external (no internal variant) +#### Design Option B: Separate Tracking Tables (Current Approach) -```python -# Table definition -class ManagedFiles(dj.Manual): - definition = """ - id : int - --- - data_path : - """ +Keep `~external_{store}` tables for tracking: -# Usage - file must be in configured stage directory -table.insert1({'id': 1, 'data_path': '/stage/experiment_001/data.h5'}) -row = (table & 'id=1').fetch1() -# row['data_path'] == '/local_stage/experiment_001/data.h5' +```sql +-- ~external_main +hash : binary(16) # UUID from content hash +--- +size : bigint +attachment_name: varchar(255) # for attach only +timestamp : timestamp ``` -#### 5. Managed Objects (``) +**Disadvantages:** +- Separate infrastructure from OAS +- Additional table maintenance +- More complex cleanup/garbage collection -**AttributeType** for managed file/folder storage with lazy access. 
+#### Recommendation -- Input: File path, folder path, or ObjectRef -- Output: ObjectRef handle (lazy - no automatic download) -- Storage: JSON metadata column -- Supports direct writes (Zarr, HDF5) via fsspec +**Option A (OAS integration)** is cleaner: +- Single storage paradigm +- Simpler mental model +- Content hash stored directly in column (no UUID indirection) +- Deduplication at storage level + +--- + +### 5. Reference Tracking (``) - TO DESIGN + +Repurpose `filepath@store` as a general reference type, borrowing from ObjRef: + +**Current `filepath@store` limitations:** +- Path-addressed (hash of path, not contents) +- Requires staging area +- Archaic copy-to/copy-from model + +**Proposed ``:** +- Track references to external resources +- Support multiple reference types (file path, URL, object key) +- Borrow lazy access patterns from ObjRef +- Optional content verification ```python @dj.register_type -class ObjectType(AttributeType): - type_name = "object" +class RefType(AttributeType): + type_name = "ref" dtype = "json" - requires_store = True # Must specify @store - - def encode(self, value, *, key=None) -> str: - # Upload file/folder to object storage - # Return JSON metadata - ... - def decode(self, json_str, *, key=None) -> ObjectRef: - # Return ObjectRef handle (no download) - ... 
+ def encode(self, value, *, key=None, store=None) -> str: + if isinstance(value, str): + # Treat as path/URL + return json.dumps({ + 'type': 'path', + 'path': value, + 'store': store.name, + 'content_hash': self._compute_hash(value) if verify else None + }) + elif isinstance(value, RefSpec): + return json.dumps(value.to_dict()) + + def decode(self, json_str, *, key=None, store=None) -> Ref: + data = json.loads(json_str) + return Ref(data, store=store) + + +class Ref: + """Reference handle (similar to ObjectRef).""" + + def __init__(self, data, store): + self.path = data['path'] + self.store = store + self._content_hash = data.get('content_hash') + + def download(self, local_path): + """Download referenced file.""" + self.store.download(self.path, local_path) + if self._content_hash: + self._verify(local_path) + + def open(self, mode='rb'): + """Open via fsspec (lazy).""" + return self.store.open(self.path, mode) ``` +**Usage:** ```python -# Table definition -class LargeData(dj.Manual): +class ExternalData(dj.Manual): definition = """ - id : int + data_id : int --- - zarr_data : + source : # reference to external file """ -# Usage -table.insert1({'id': 1, 'zarr_data': '/path/to/data.zarr'}) -row = (table & 'id=1').fetch1() -ref = row['zarr_data'] # ObjectRef handle -ref.download('/local/path') # Explicit download -# Or direct access via fsspec +# Insert - just tracks the reference +table.insert1({'data_id': 1, 'source': '/archive/experiment_001/data.h5'}) + +# Fetch - returns Ref handle +row = (table & 'data_id=1').fetch1() +ref = row['source'] +ref.download('/local/data.h5') # explicit download ``` -### Storage Location Modifier (`@store`) +--- -The `@store` suffix is orthogonal to the type and specifies external storage: +## Summary of Types -| Type | Without @store | With @store | -|------|---------------|-------------| -| `longblob` | Raw bytes in DB | Raw bytes in external store | -| `` | Serialized in DB | Serialized in external store | -| `` | File in DB | 
File in external store | -| `` | N/A (error) | Path reference in external store | -| `` | N/A (error) | Object in external store | +| Type | Storage | Column | Input | Output | Dedup | +|------|---------|--------|-------|--------|-------| +| `object@store` | OAS store | JSON | path/ref | ObjectRef | By path | +| `` | Internal | LONGBLOB | any | any | No | +| `` | OAS `_external/` | varchar(64) | any | any | By content | +| `` | Internal | LONGBLOB | path | path | No | +| `` | OAS `_external/` | varchar(64) | path | path | By content | +| `` | OAS store | JSON | path/ref | Ref | No (tracks) | -Implementation: -- `@store` changes the underlying `dtype` to `binary(16)` (UUID) -- Creates FK relationship to `~external_{store}` tracking table -- AttributeType's `encode()`/`decode()` work with the external table transparently +## Open Questions -### Extended AttributeType Interface +1. **Store syntax**: Should external AttributeTypes use `` or detect externality from dtype? -For types that interact with the filesystem, we extend the base interface: +2. **Backward compatibility**: How to handle existing `blob@store` and `attach@store` columns with `~external_*` tables? -```python -class FileSystemType(AttributeType): - """Base for types that work with file paths.""" +3. **Deduplication scope**: Per-store or global across stores? - # Standard interface - def encode(self, value, *, key=None) -> bytes | str: - """Convert input (path or value) to stored representation.""" - ... +4. **Ref vs filepath**: Deprecate `filepath@store` entirely or keep as alias? - def decode(self, stored, *, key=None) -> str: - """Convert stored representation to local file path.""" - ... +5. **Content hash format**: SHA256 hex (64 chars) or shorter hash? - # Extended interface for external storage - def upload(self, filepath: Path, external: ExternalTable) -> uuid.UUID: - """Upload file to external storage, return UUID.""" - ... 
+## Implementation Phases - def download(self, uuid: uuid.UUID, external: ExternalTable, - download_path: Path) -> Path: - """Download from external storage to local path.""" - ... -``` +### Phase 1: `` Internal +- Implement AttachType for internal blob storage +- Deprecate bare `attach` keyword (still works, warns) -### Configuration +### Phase 2: Content-Addressed External +- Implement ContentAddressedType base +- Add `` and `` +- Store in OAS `_external/` folder -```python -# datajoint config -dj.config['stores'] = { - 'main': { - 'protocol': 's3', - 'endpoint': 's3.amazonaws.com', - 'bucket': 'my-bucket', - 'location': 'datajoint/', - }, - 'archive': { - 'protocol': 'file', - 'location': '/mnt/archive/', - } -} - -dj.config['download_path'] = '/tmp/dj_downloads' # For attach -dj.config['stage'] = '/data/stage' # For filepath -``` +### Phase 3: Reference Type +- Implement `` with Ref handle +- Deprecate `filepath@store` -## Migration Path - -### Phase 1: Current State (Done) -- `` AttributeType implemented -- `longblob` returns raw bytes -- Legacy `AttributeAdapter` wrapped for backward compat - -### Phase 2: Attach as AttributeType -- Implement `` and `` as AttributeType -- Deprecate bare `attach` type (still works, emits warning) -- Move logic from table.py/fetch.py to AttachType class - -### Phase 3: Filepath as AttributeType -- Implement `` as AttributeType -- Deprecate `filepath@store` syntax (redirect to ``) - -### Phase 4: Object Type Refinement -- Already implemented as separate system -- Ensure consistent with AttributeType patterns -- Consider `` syntax - -### Phase 5: Cleanup -- Remove scattered type handling from table.py, fetch.py -- Consolidate external storage logic -- Update documentation - -## Summary - -| Type | Input | Output | Internal | External | Use Case | -|------|-------|--------|----------|----------|----------| -| `longblob` | bytes | bytes | ✓ | ✓ | Raw binary data | -| `` | any | any | ✓ | ✓ | Python objects, arrays | -| `` | path | 
path | ✓ | ✓ | Files with filename | -| `` | path | path | ✗ | ✓ | Managed file workflows | -| `` | path/ref | ObjectRef | ✗ | ✓ | Large files, Zarr, HDF5 | - -This design: -1. Makes all custom types consistent AttributeTypes -2. Separates storage location (`@store`) from encoding behavior -3. Provides clear semantics for each type -4. Enables gradual migration from current implementation +### Phase 4: Migration Tools +- Tools to migrate `~external_*` data to new format +- Backward compat layer for reading old format From 7e7f9682d9336e80f833c4e3f11496609a4482d6 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 05:31:52 +0000 Subject: [PATCH 12/42] Unify external storage under OAS with content-addressed region - All external storage uses OAS infrastructure - Path-addressed: regular object@store (existing) - Content-addressed: _content/ folder for , - ContentRegistry table for reference counting and GC - ObjectRef returned for all external types (lazy access) - Deduplication via SHA256 content hash Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 469 +++++++++---------- 1 file changed, 223 insertions(+), 246 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 79627a990..844564755 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -2,337 +2,314 @@ ## Overview -This document proposes a redesign of DataJoint's storage types as AttributeTypes, with clear separation between: +This document proposes a unified storage architecture where all external storage uses the Object-Augmented Schema (OAS) paradigm, with a special content-addressable region for deduplicated objects. -1. **Object-Augmented Schemas (OAS)** - New paradigm with managed stores, integrity constraints, and prescribed organization -2. **Legacy External Storage** - Content-addressed blob/attach storage with deduplication -3. 
**Internal Blob Types** - AttributeTypes that serialize into database blob columns +## Architecture -## Type Categories +### Two Storage Modes within OAS -### 1. Object-Augmented Schemas (`object`, `object@store`) +``` +store_root/ +├── {schema}/{table}/{pk}/ # Path-addressed (regular OAS) +│ └── {attribute}/ # Derived from primary key +│ └── ... # Files, folders, Zarr, etc. +│ +└── _content/ # Content-addressed (deduplicated) + └── {hash[:2]}/{hash[2:4]}/ + └── {hash}/ # Full SHA256 hash + └── ... # Object contents +``` -**Already implemented.** A distinct system where stores are treated as part of the database: +### 1. Path-Addressed Objects (`object@store`) -- Robust integrity constraints -- Prescribed path organization (derived from primary key) -- Multiple store support via config +**Already implemented.** Regular OAS behavior: +- Path derived from primary key +- One-to-one relationship with table row +- Deleted when row is deleted - Returns `ObjectRef` for lazy access -- Supports direct writes (Zarr, HDF5) via fsspec ```python -# Table definition class Analysis(dj.Computed): definition = """ -> Recording --- - results : object@main # stored in 'main' OAS store + results : object@main """ - -# Usage -row = (Analysis & key).fetch1() -ref = row['results'] # ObjectRef handle (lazy) -ref.download('/local/path') # explicit download -data = ref.open() # fsspec access ``` -**This type is NOT part of the AttributeType redesign** - it has its own implementation path. +### 2. Content-Addressed Objects (``, ``) ---- - -### 2. Serialized Blobs (``) - -**Already implemented.** AttributeType for Python object serialization. - -- Input: Any Python object (arrays, dicts, lists, etc.) 
-- Output: Same Python object reconstructed -- Storage: DJ blob format (mYm/dj0 protocol) in LONGBLOB column +**New.** Stored in `_content/` region with deduplication: +- Path derived from content hash (SHA256) +- Many-to-one: multiple rows can reference same object +- Reference counted for garbage collection +- Returns `ObjectRef` for lazy access (same as regular OAS) ```python -class DJBlobType(AttributeType): - type_name = "djblob" - dtype = "longblob" - - def encode(self, value, *, key=None) -> bytes: - return blob.pack(value, compress=True) - - def decode(self, stored, *, key=None) -> Any: - return blob.unpack(stored) +class ProcessedData(dj.Computed): + definition = """ + -> RawData + --- + features : # Serialized Python object, deduplicated + source_file : # File attachment, deduplicated + """ ``` ---- +## Content-Addressed Storage Design -### 3. File Attachments (``) - TO IMPLEMENT - -AttributeType for serializing files into internal blob columns. - -- Input: File path (string or Path) -- Output: Local file path after extraction -- Storage: `filename\0contents` in LONGBLOB column +### Storage Path ```python -@dj.register_type -class AttachType(AttributeType): - type_name = "attach" - dtype = "longblob" - - def encode(self, filepath, *, key=None) -> bytes: - path = Path(filepath) - return path.name.encode() + b"\0" + path.read_bytes() +def content_path(content_hash: str) -> str: + """Generate path for content-addressed object.""" + return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" - def decode(self, stored, *, key=None) -> str: - filename, contents = stored.split(b"\0", 1) - download_path = Path(dj.config['download_path']) / filename - download_path.parent.mkdir(parents=True, exist_ok=True) - download_path.write_bytes(contents) - return str(download_path) +# Example: hash "a1b2c3d4..." -> "_content/a1/b2/a1b2c3d4..." 
``` -**Usage:** +### Reference Registry + +A schema-level table tracks content-addressed objects for reference counting: + ```python -class Configs(dj.Manual): +class ContentRegistry: + """ + Tracks content-addressed objects for garbage collection. + One per schema, created automatically when content-addressed types are used. + """ definition = """ - config_id : int + # Content-addressed object registry + content_hash : char(64) # SHA256 hex --- - config_file : # file serialized into DB + store : varchar(64) # Store name + size : bigint unsigned # Object size in bytes + created : timestamp DEFAULT CURRENT_TIMESTAMP """ +``` -# Insert -table.insert1({'config_id': 1, 'config_file': '/path/to/config.yaml'}) +### Reference Counting -# Fetch - file extracted to download_path -row = (table & 'config_id=1').fetch1() -local_path = row['config_file'] # '/downloads/config.yaml' -``` +Reference counting is implicit via database queries: ---- +```python +def find_orphans(schema) -> list[tuple[str, str]]: + """Find content hashes not referenced by any table.""" + + # Get all registered hashes + registered = set(ContentRegistry().fetch('content_hash', 'store')) + + # Get all referenced hashes from tables + referenced = set() + for table in schema.tables: + for attr in table.heading.attributes: + if attr.is_content_addressed: + hashes = table.fetch(attr.name) + referenced.update((h, attr.store) for h in hashes) + + return registered - referenced + +def garbage_collect(schema): + """Remove orphaned content-addressed objects.""" + for content_hash, store in find_orphans(schema): + # Delete from storage + store_backend = get_store(store) + store_backend.delete(content_path(content_hash)) + # Delete from registry + (ContentRegistry() & {'content_hash': content_hash}).delete() +``` -### 4. External Content-Addressed Storage (``, ``) - TO DESIGN +### ObjectRef for Content-Addressed Objects -These types store content externally with deduplication via content hashing. 
+Content-addressed objects return `ObjectRef` just like regular OAS objects: -#### Design Option A: Leverage OAS Stores +```python +row = (ProcessedData & key).fetch1() -Store content-addressed blobs within OAS stores under a reserved folder: +# Both return ObjectRef +results_ref = row['features'] # +file_ref = row['source_file'] # -``` -store_root/ -├── _external/ # Reserved for content-addressed storage -│ ├── blobs/ # For -│ │ └── ab/cd/abcd1234... # Path derived from content hash -│ └── attach/ # For -│ └── ef/gh/efgh5678.../filename.ext -└── schema_name/ # Normal OAS paths - └── table_name/ - └── pk_value/ +# Same interface as regular OAS +results_ref.download('/local/path') +data = results_ref.load() # For djblob: deserialize +local_path = file_ref.download() # For attach: download, return path ``` -**Advantages:** -- Reuses OAS infrastructure (fsspec, store config) -- DataJoint fully controls paths -- Deduplication via content hash -- No separate `~external_*` tracking tables needed +## AttributeType Implementations -**Implementation:** +### `` - Internal Serialized Blob ```python -class ContentAddressedType(AttributeType): - """Base class for content-addressed external storage.""" - - subfolder: str # 'blobs' or 'attach' +@dj.register_type +class DJBlobType(AttributeType): + type_name = "djblob" + dtype = "longblob" - def _content_hash(self, data: bytes) -> str: - """Compute content hash for deduplication.""" - return hashlib.sha256(data).hexdigest() + def encode(self, value, *, key=None) -> bytes: + from . import blob + return blob.pack(value, compress=True) - def _store_path(self, content_hash: str) -> str: - """Generate path within _external folder.""" - return f"_external/{self.subfolder}/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + def decode(self, stored, *, key=None) -> Any: + from . 
import blob + return blob.unpack(stored) +``` +### `` - External Serialized Blob (Content-Addressed) +```python @dj.register_type -class DJBlobExternalType(ContentAddressedType): - type_name = "djblob" # Same name, different dtype triggers external - dtype = "varchar(64)" # Store content hash as string - subfolder = "blobs" +class DJBlobExternalType(AttributeType): + type_name = "djblob" + dtype = "char(64)" # Content hash stored in column + is_content_addressed = True def encode(self, value, *, key=None, store=None) -> str: + from . import blob data = blob.pack(value, compress=True) - content_hash = self._content_hash(data) - path = self._store_path(content_hash) - # Upload to store if not exists (deduplication) - store.put_if_absent(path, data) + content_hash = hashlib.sha256(data).hexdigest() + + # Upload if not exists (deduplication) + path = content_path(content_hash) + if not store.exists(path): + store.put(path, data) + ContentRegistry().insert1({ + 'content_hash': content_hash, + 'store': store.name, + 'size': len(data) + }) + return content_hash - def decode(self, content_hash, *, key=None, store=None) -> Any: - path = self._store_path(content_hash) - data = store.get(path) - return blob.unpack(data) + def decode(self, content_hash, *, key=None, store=None) -> ObjectRef: + # Return ObjectRef for lazy access + return ObjectRef( + path=content_path(content_hash), + store=store, + loader=blob.unpack # Custom loader for deserialization + ) +``` +### `` - Internal File Attachment +```python @dj.register_type -class AttachExternalType(ContentAddressedType): +class AttachType(AttributeType): type_name = "attach" - dtype = "varchar(64)" - subfolder = "attach" + dtype = "longblob" - def encode(self, filepath, *, key=None, store=None) -> str: + def encode(self, filepath, *, key=None) -> bytes: path = Path(filepath) - # Hash includes filename for uniqueness - data = path.name.encode() + b"\0" + path.read_bytes() - content_hash = self._content_hash(data) - store_path 
= self._store_path(content_hash) + "/" + path.name - store.put_if_absent(store_path, path.read_bytes()) - return content_hash - - def decode(self, content_hash, *, key=None, store=None) -> str: - # List files in hash folder to get filename - ... -``` - -#### Design Option B: Separate Tracking Tables (Current Approach) - -Keep `~external_{store}` tables for tracking: + return path.name.encode() + b"\0" + path.read_bytes() -```sql --- ~external_main -hash : binary(16) # UUID from content hash ---- -size : bigint -attachment_name: varchar(255) # for attach only -timestamp : timestamp + def decode(self, stored, *, key=None) -> str: + filename, contents = stored.split(b"\0", 1) + filename = filename.decode() + download_path = Path(dj.config['download_path']) / filename + download_path.parent.mkdir(parents=True, exist_ok=True) + download_path.write_bytes(contents) + return str(download_path) ``` -**Disadvantages:** -- Separate infrastructure from OAS -- Additional table maintenance -- More complex cleanup/garbage collection - -#### Recommendation - -**Option A (OAS integration)** is cleaner: -- Single storage paradigm -- Simpler mental model -- Content hash stored directly in column (no UUID indirection) -- Deduplication at storage level - ---- - -### 5. 
Reference Tracking (``) - TO DESIGN - -Repurpose `filepath@store` as a general reference type, borrowing from ObjRef: - -**Current `filepath@store` limitations:** -- Path-addressed (hash of path, not contents) -- Requires staging area -- Archaic copy-to/copy-from model - -**Proposed ``:** -- Track references to external resources -- Support multiple reference types (file path, URL, object key) -- Borrow lazy access patterns from ObjRef -- Optional content verification +### `` - External File Attachment (Content-Addressed) ```python @dj.register_type -class RefType(AttributeType): - type_name = "ref" - dtype = "json" +class AttachExternalType(AttributeType): + type_name = "attach" + dtype = "char(64)" # Content hash stored in column + is_content_addressed = True - def encode(self, value, *, key=None, store=None) -> str: - if isinstance(value, str): - # Treat as path/URL - return json.dumps({ - 'type': 'path', - 'path': value, + def encode(self, filepath, *, key=None, store=None) -> str: + path = Path(filepath) + data = path.read_bytes() + # Hash includes filename for uniqueness + content_hash = hashlib.sha256( + path.name.encode() + b"\0" + data + ).hexdigest() + + # Store as folder with original filename preserved + obj_path = content_path(content_hash) + if not store.exists(obj_path): + store.put(f"{obj_path}/{path.name}", data) + ContentRegistry().insert1({ + 'content_hash': content_hash, 'store': store.name, - 'content_hash': self._compute_hash(value) if verify else None + 'size': len(data) }) - elif isinstance(value, RefSpec): - return json.dumps(value.to_dict()) - def decode(self, json_str, *, key=None, store=None) -> Ref: - data = json.loads(json_str) - return Ref(data, store=store) + return content_hash + + def decode(self, content_hash, *, key=None, store=None) -> ObjectRef: + return ObjectRef( + path=content_path(content_hash), + store=store, + # ObjectRef handles file download + ) +``` +## Unified ObjectRef Interface -class Ref: - """Reference handle 
(similar to ObjectRef).""" +All external storage (both path-addressed and content-addressed) returns `ObjectRef`: - def __init__(self, data, store): - self.path = data['path'] +```python +class ObjectRef: + """Lazy reference to stored object.""" + + def __init__(self, path, store, loader=None): + self.path = path self.store = store - self._content_hash = data.get('content_hash') + self._loader = loader # Optional custom deserializer - def download(self, local_path): - """Download referenced file.""" + def download(self, local_path=None) -> Path: + """Download object to local filesystem.""" + if local_path is None: + local_path = Path(dj.config['download_path']) / Path(self.path).name self.store.download(self.path, local_path) - if self._content_hash: - self._verify(local_path) + return local_path + + def load(self) -> Any: + """Load and optionally deserialize object.""" + data = self.store.get(self.path) + if self._loader: + return self._loader(data) + return data def open(self, mode='rb'): - """Open via fsspec (lazy).""" + """Open via fsspec for streaming access.""" return self.store.open(self.path, mode) ``` -**Usage:** -```python -class ExternalData(dj.Manual): - definition = """ - data_id : int - --- - source : # reference to external file - """ - -# Insert - just tracks the reference -table.insert1({'data_id': 1, 'source': '/archive/experiment_001/data.h5'}) - -# Fetch - returns Ref handle -row = (table & 'data_id=1').fetch1() -ref = row['source'] -ref.download('/local/data.h5') # explicit download -``` - ---- +## Summary -## Summary of Types +| Type | Storage | Column | Dedup | Returns | +|------|---------|--------|-------|---------| +| `object@store` | `{schema}/{table}/{pk}/` | JSON | No | ObjectRef | +| `` | Internal DB | LONGBLOB | No | Python object | +| `` | `_content/{hash}/` | char(64) | Yes | ObjectRef | +| `` | Internal DB | LONGBLOB | No | Local path | +| `` | `_content/{hash}/` | char(64) | Yes | ObjectRef | -| Type | Storage | Column | Input | 
Output | Dedup | -|------|---------|--------|-------|--------|-------| -| `object@store` | OAS store | JSON | path/ref | ObjectRef | By path | -| `` | Internal | LONGBLOB | any | any | No | -| `` | OAS `_external/` | varchar(64) | any | any | By content | -| `` | Internal | LONGBLOB | path | path | No | -| `` | OAS `_external/` | varchar(64) | path | path | By content | -| `` | OAS store | JSON | path/ref | Ref | No (tracks) | +## Key Design Decisions -## Open Questions - -1. **Store syntax**: Should external AttributeTypes use `` or detect externality from dtype? - -2. **Backward compatibility**: How to handle existing `blob@store` and `attach@store` columns with `~external_*` tables? - -3. **Deduplication scope**: Per-store or global across stores? +1. **Unified OAS paradigm**: All external storage uses OAS infrastructure +2. **Content-addressed region**: `_content/` folder for deduplicated objects +3. **Reference counting**: Via `ContentRegistry` table + query-based orphan detection +4. **ObjectRef everywhere**: External types return ObjectRef for consistent lazy access +5. **Deduplication**: Content hash determines identity; identical content stored once -4. **Ref vs filepath**: Deprecate `filepath@store` entirely or keep as alias? +## Migration from Legacy `~external_*` -5. **Content hash format**: SHA256 hex (64 chars) or shorter hash? +For existing schemas with `~external_*` tables: -## Implementation Phases +1. Read legacy external references +2. Re-upload to `_content/` region +3. Update column values to content hashes +4. Drop `~external_*` tables +5. 
Create `ContentRegistry` entries -### Phase 1: `<attach>` Internal -- Implement AttachType for internal blob storage -- Deprecate bare `attach` keyword (still works, warns) - -### Phase 2: Content-Addressed External -- Implement ContentAddressedType base -- Add `<djblob@store>` and `<attach@store>` -- Store in OAS `_external/` folder - -### Phase 3: Reference Type -- Implement `<ref@store>` with Ref handle -- Deprecate `filepath@store` +## Open Questions -### Phase 4: Migration Tools -- Tools to migrate `~external_*` data to new format -- Backward compat layer for reading old format +1. **Hash collision**: SHA256 is effectively collision-free, but should we verify on fetch? +2. **Partial uploads**: How to handle interrupted uploads? Temp path then rename? +3. **Cross-schema deduplication**: Should `_content/` be per-schema or global? +4. **Backward compat**: How long to support reading from legacy `~external_*`? From 495d7f7d667845296c3fd4f1eacbe5e81443e99e Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 05:35:36 +0000 Subject: [PATCH 13/42] Make <djblob@store> and <attach@store> return values transparently - <djblob@store> returns Python object (fetched and deserialized) - <attach@store> returns local file path (downloaded automatically) - Only object@store returns ObjectRef for explicit lazy access - External storage is transparent - @store only affects where, not how Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 94 ++++++++++---------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 844564755..6b90ac164 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -43,15 +43,15 @@ class Analysis(dj.Computed): - Path derived from content hash (SHA256) - Many-to-one: multiple rows can reference same object - Reference counted for garbage collection -- Returns `ObjectRef` for lazy access (same as regular OAS) +- **Transparent access**: Returns same type as
internal variant (Python object or file path) ```python class ProcessedData(dj.Computed): definition = """ -> RawData --- - features : <djblob@store> # Serialized Python object, deduplicated - source_file : <attach@store> # File attachment, deduplicated + features : <djblob@store> # Returns Python object (fetched transparently) + source_file : <attach@store> # Returns local file path (downloaded transparently) """ ``` @@ -118,23 +118,27 @@ def garbage_collect(schema): (ContentRegistry() & {'content_hash': content_hash}).delete() ``` -### ObjectRef for Content-Addressed Objects +### Transparent Access for Content-Addressed Objects -Content-addressed objects return `ObjectRef` just like regular OAS objects: +Content-addressed objects return the same types as their internal counterparts: ```python row = (ProcessedData & key).fetch1() -# Both return ObjectRef -results_ref = row['features'] # <djblob@store> -file_ref = row['source_file'] # <attach@store> +# <djblob@store> returns Python object (like <djblob>) +features = row['features'] # dict, array, etc. - fetched and deserialized -# Same interface as regular OAS -results_ref.download('/local/path') -data = results_ref.load() # For djblob: deserialize -local_path = file_ref.download() # For attach: download, return path +# <attach@store> returns local file path (like <attach>) +local_path = row['source_file'] # '/downloads/data.csv' - downloaded automatically + +# Only object@store returns ObjectRef for explicit lazy access +ref = row['results'] # ObjectRef - user controls when to download ``` +This makes external storage transparent - users work with Python objects and file paths, +not storage references. The `@store` suffix only affects where data is stored, not how +it's accessed.
+ ## AttributeType Implementations ### `` - Internal Serialized Blob @@ -180,13 +184,12 @@ class DJBlobExternalType(AttributeType): return content_hash - def decode(self, content_hash, *, key=None, store=None) -> ObjectRef: - # Return ObjectRef for lazy access - return ObjectRef( - path=content_path(content_hash), - store=store, - loader=blob.unpack # Custom loader for deserialization - ) + def decode(self, content_hash, *, key=None, store=None) -> Any: + # Fetch and deserialize - transparent to user + from . import blob + path = content_path(content_hash) + data = store.get(path) + return blob.unpack(data) ``` ### `` - Internal File Attachment @@ -227,7 +230,7 @@ class AttachExternalType(AttributeType): path.name.encode() + b"\0" + data ).hexdigest() - # Store as folder with original filename preserved + # Store with original filename preserved obj_path = content_path(content_hash) if not store.exists(obj_path): store.put(f"{obj_path}/{path.name}", data) @@ -239,26 +242,29 @@ class AttachExternalType(AttributeType): return content_hash - def decode(self, content_hash, *, key=None, store=None) -> ObjectRef: - return ObjectRef( - path=content_path(content_hash), - store=store, - # ObjectRef handles file download - ) + def decode(self, content_hash, *, key=None, store=None) -> str: + # Download and return local path - transparent to user + obj_path = content_path(content_hash) + # List to get filename (stored as {hash}/{filename}) + filename = store.list(obj_path)[0] + download_path = Path(dj.config['download_path']) / filename + download_path.parent.mkdir(parents=True, exist_ok=True) + store.download(f"{obj_path}/{filename}", download_path) + return str(download_path) ``` -## Unified ObjectRef Interface +## ObjectRef Interface (for `object@store` only) -All external storage (both path-addressed and content-addressed) returns `ObjectRef`: +Only `object@store` returns `ObjectRef` for explicit lazy access. 
This is intentional - +large files and folders (Zarr, HDF5, etc.) benefit from user-controlled download/access. ```python class ObjectRef: - """Lazy reference to stored object.""" + """Lazy reference to stored object (object@store only).""" - def __init__(self, path, store, loader=None): + def __init__(self, path, store): self.path = path self.store = store - self._loader = loader # Optional custom deserializer def download(self, local_path=None) -> Path: """Download object to local filesystem.""" @@ -267,35 +273,33 @@ class ObjectRef: self.store.download(self.path, local_path) return local_path - def load(self) -> Any: - """Load and optionally deserialize object.""" - data = self.store.get(self.path) - if self._loader: - return self._loader(data) - return data - def open(self, mode='rb'): - """Open via fsspec for streaming access.""" + """Open via fsspec for streaming/direct access.""" return self.store.open(self.path, mode) + + def exists(self) -> bool: + """Check if object exists in store.""" + return self.store.exists(self.path) ``` ## Summary | Type | Storage | Column | Dedup | Returns | |------|---------|--------|-------|---------| -| `object@store` | `{schema}/{table}/{pk}/` | JSON | No | ObjectRef | +| `object@store` | `{schema}/{table}/{pk}/` | JSON | No | ObjectRef (lazy) | | `<djblob>` | Internal DB | LONGBLOB | No | Python object | -| `<djblob@store>` | `_content/{hash}/` | char(64) | Yes | ObjectRef | -| `<attach>` | Internal DB | LONGBLOB | No | Local path | -| `<attach@store>` | `_content/{hash}/` | char(64) | Yes | ObjectRef | +| `<djblob@store>` | `_content/{hash}/` | char(64) | Yes | Python object | +| `<attach>` | Internal DB | LONGBLOB | No | Local file path | +| `<attach@store>` | `_content/{hash}/` | char(64) | Yes | Local file path | ## Key Design Decisions 1. **Unified OAS paradigm**: All external storage uses OAS infrastructure 2. **Content-addressed region**: `_content/` folder for deduplicated objects 3. **Reference counting**: Via `ContentRegistry` table + query-based orphan detection -4.
**ObjectRef everywhere**: External types return ObjectRef for consistent lazy access -5. **Deduplication**: Content hash determines identity; identical content stored once +4. **Transparent access**: `<djblob@store>` and `<attach@store>` return same types as internal variants +5. **Lazy access for objects**: Only `object@store` returns ObjectRef (for large files/folders) +6. **Deduplication**: Content hash determines identity; identical content stored once ## Migration from Legacy `~external_*` From 7ae8f1577396b08a61eb8f271a69e0f508a87103 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 05:44:06 +0000 Subject: [PATCH 14/42] Introduce layered storage architecture with content core type Three-layer architecture: 1. MySQL types: longblob, varchar, etc. 2. Core DataJoint types: object, content (and @store variants) 3. AttributeTypes: <djblob>, <xblob>, <attach>, <xattach> New core type `content` for content-addressed storage: - Accepts bytes, returns bytes - Handles hashing, deduplication, and GC registration - AttributeTypes like <xblob> build serialization on top Naming convention: - <djblob> = internal serialized (database) - <xblob> = external serialized (content-addressed) - <attach> = internal file - <xattach> = external file Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 398 +++++++++---------- 1 file changed, 190 insertions(+), 208 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 6b90ac164..3b48bb50a 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -2,152 +2,107 @@ ## Overview -This document proposes a unified storage architecture where all external storage uses the Object-Augmented Schema (OAS) paradigm, with a special content-addressable region for deduplicated objects. +This document defines a layered storage architecture: -## Architecture +1. **MySQL types**: `longblob`, `varchar`, `int`, etc. +2. **Core DataJoint types**: `object`, `content` (and their `@store` variants) +3.
**AttributeTypes**: ``, ``, ``, etc. (built on top of core types) -### Two Storage Modes within OAS +## Core Types -``` -store_root/ -├── {schema}/{table}/{pk}/ # Path-addressed (regular OAS) -│ └── {attribute}/ # Derived from primary key -│ └── ... # Files, folders, Zarr, etc. -│ -└── _content/ # Content-addressed (deduplicated) - └── {hash[:2]}/{hash[2:4]}/ - └── {hash}/ # Full SHA256 hash - └── ... # Object contents -``` +### `object` / `object@store` - Path-Addressed Storage -### 1. Path-Addressed Objects (`object@store`) +**Already implemented.** OAS (Object-Augmented Schema) storage: -**Already implemented.** Regular OAS behavior: -- Path derived from primary key +- Path derived from primary key: `{schema}/{table}/{pk}/{attribute}/` - One-to-one relationship with table row - Deleted when row is deleted - Returns `ObjectRef` for lazy access +- Supports direct writes (Zarr, HDF5) via fsspec ```python class Analysis(dj.Computed): definition = """ -> Recording --- - results : object@main + results : object # default store + archive : object@cold # specific store """ ``` -### 2. 
Content-Addressed Objects (``, ``) +### `content` / `content@store` - Content-Addressed Storage -**New.** Stored in `_content/` region with deduplication: -- Path derived from content hash (SHA256) -- Many-to-one: multiple rows can reference same object -- Reference counted for garbage collection -- **Transparent access**: Returns same type as internal variant (Python object or file path) +**New core type.** Content-addressed storage with deduplication: -```python -class ProcessedData(dj.Computed): - definition = """ - -> RawData - --- - features : # Returns Python object (fetched transparently) - source_file : # Returns local file path (downloaded transparently) - """ -``` - -## Content-Addressed Storage Design - -### Storage Path - -```python -def content_path(content_hash: str) -> str: - """Generate path for content-addressed object.""" - return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" +- Path derived from content hash: `_content/{hash[:2]}/{hash[2:4]}/{hash}/` +- Many-to-one: multiple rows can reference same content +- Reference counted for garbage collection +- Deduplication: identical content stored once -# Example: hash "a1b2c3d4..." -> "_content/a1/b2/a1b2c3d4..." ``` - -### Reference Registry - -A schema-level table tracks content-addressed objects for reference counting: - -```python -class ContentRegistry: - """ - Tracks content-addressed objects for garbage collection. - One per schema, created automatically when content-addressed types are used. 
- """ - definition = """ - # Content-addressed object registry - content_hash : char(64) # SHA256 hex - --- - store : varchar(64) # Store name - size : bigint unsigned # Object size in bytes - created : timestamp DEFAULT CURRENT_TIMESTAMP - """ +store_root/ +├── {schema}/{table}/{pk}/ # object storage (path-addressed) +│ └── {attribute}/ +│ +└── _content/ # content storage (content-addressed) + └── {hash[:2]}/{hash[2:4]}/{hash}/ ``` -### Reference Counting +#### Content Type Behavior -Reference counting is implicit via database queries: +The `content` core type: +- Accepts `bytes` on insert +- Computes SHA256 hash of the content +- Stores in `_content/{hash}/` if not already present (deduplication) +- Returns `bytes` on fetch (transparent retrieval) +- Registers in `ContentRegistry` for GC tracking ```python -def find_orphans(schema) -> list[tuple[str, str]]: - """Find content hashes not referenced by any table.""" +# Core type behavior (built-in, not an AttributeType) +class ContentType: + """Core content-addressed storage type.""" - # Get all registered hashes - registered = set(ContentRegistry().fetch('content_hash', 'store')) + def store(self, data: bytes, store_backend) -> str: + """Store content, return hash.""" + content_hash = hashlib.sha256(data).hexdigest() + path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" - # Get all referenced hashes from tables - referenced = set() - for table in schema.tables: - for attr in table.heading.attributes: - if attr.is_content_addressed: - hashes = table.fetch(attr.name) - referenced.update((h, attr.store) for h in hashes) + if not store_backend.exists(path): + store_backend.put(path, data) + ContentRegistry().insert1({ + 'content_hash': content_hash, + 'store': store_backend.name, + 'size': len(data) + }) - return registered - referenced + return content_hash -def garbage_collect(schema): - """Remove orphaned content-addressed objects.""" - for content_hash, store in find_orphans(schema): - # Delete 
from storage - store_backend = get_store(store) - store_backend.delete(content_path(content_hash)) - # Delete from registry - (ContentRegistry() & {'content_hash': content_hash}).delete() + def retrieve(self, content_hash: str, store_backend) -> bytes: + """Retrieve content by hash.""" + path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + return store_backend.get(path) ``` -### Transparent Access for Content-Addressed Objects - -Content-addressed objects return the same types as their internal counterparts: - -```python -row = (ProcessedData & key).fetch1() - -# returns Python object (like ) -features = row['features'] # dict, array, etc. - fetched and deserialized +#### Database Column -# returns local file path (like ) -local_path = row['source_file'] # '/downloads/data.csv' - downloaded automatically +The `content` type stores a `char(64)` hash in the database: -# Only object@store returns ObjectRef for explicit lazy access -ref = row['results'] # ObjectRef - user controls when to download +```sql +-- content column +features CHAR(64) NOT NULL -- SHA256 hex hash ``` -This makes external storage transparent - users work with Python objects and file paths, -not storage references. The `@store` suffix only affects where data is stored, not how -it's accessed. - -## AttributeType Implementations +## AttributeTypes (Built on Core Types) ### `` - Internal Serialized Blob +Serialized Python object stored in database. + ```python @dj.register_type class DJBlobType(AttributeType): type_name = "djblob" - dtype = "longblob" + dtype = "longblob" # MySQL type def encode(self, value, *, key=None) -> bytes: from . import blob @@ -158,42 +113,42 @@ class DJBlobType(AttributeType): return blob.unpack(stored) ``` -### `` - External Serialized Blob (Content-Addressed) +### `` / `` - External Serialized Blob + +Serialized Python object stored in content-addressed storage. 
```python @dj.register_type -class DJBlobExternalType(AttributeType): - type_name = "djblob" - dtype = "char(64)" # Content hash stored in column - is_content_addressed = True +class XBlobType(AttributeType): + type_name = "xblob" + dtype = "content" # Core type - uses default store + # dtype = "content@store" for specific store - def encode(self, value, *, key=None, store=None) -> str: + def encode(self, value, *, key=None) -> bytes: from . import blob - data = blob.pack(value, compress=True) - content_hash = hashlib.sha256(data).hexdigest() - - # Upload if not exists (deduplication) - path = content_path(content_hash) - if not store.exists(path): - store.put(path, data) - ContentRegistry().insert1({ - 'content_hash': content_hash, - 'store': store.name, - 'size': len(data) - }) - - return content_hash + return blob.pack(value, compress=True) - def decode(self, content_hash, *, key=None, store=None) -> Any: - # Fetch and deserialize - transparent to user + def decode(self, stored, *, key=None) -> Any: from . import blob - path = content_path(content_hash) - data = store.get(path) - return blob.unpack(data) + return blob.unpack(stored) +``` + +Usage: +```python +class ProcessedData(dj.Computed): + definition = """ + -> RawData + --- + small_result : # internal (in database) + large_result : # external (default store) + archive_result : # external (specific store) + """ ``` ### `` - Internal File Attachment +File stored in database with filename preserved. + ```python @dj.register_type class AttachType(AttributeType): @@ -213,107 +168,134 @@ class AttachType(AttributeType): return str(download_path) ``` -### `` - External File Attachment (Content-Addressed) +### `` / `` - External File Attachment + +File stored in content-addressed storage with filename preserved. 
```python @dj.register_type -class AttachExternalType(AttributeType): - type_name = "attach" - dtype = "char(64)" # Content hash stored in column - is_content_addressed = True +class XAttachType(AttributeType): + type_name = "xattach" + dtype = "content" # Core type - def encode(self, filepath, *, key=None, store=None) -> str: + def encode(self, filepath, *, key=None) -> bytes: path = Path(filepath) - data = path.read_bytes() - # Hash includes filename for uniqueness - content_hash = hashlib.sha256( - path.name.encode() + b"\0" + data - ).hexdigest() - - # Store with original filename preserved - obj_path = content_path(content_hash) - if not store.exists(obj_path): - store.put(f"{obj_path}/{path.name}", data) - ContentRegistry().insert1({ - 'content_hash': content_hash, - 'store': store.name, - 'size': len(data) - }) - - return content_hash + # Include filename in stored data + return path.name.encode() + b"\0" + path.read_bytes() - def decode(self, content_hash, *, key=None, store=None) -> str: - # Download and return local path - transparent to user - obj_path = content_path(content_hash) - # List to get filename (stored as {hash}/{filename}) - filename = store.list(obj_path)[0] + def decode(self, stored, *, key=None) -> str: + filename, contents = stored.split(b"\0", 1) + filename = filename.decode() download_path = Path(dj.config['download_path']) / filename download_path.parent.mkdir(parents=True, exist_ok=True) - store.download(f"{obj_path}/{filename}", download_path) + download_path.write_bytes(contents) return str(download_path) ``` -## ObjectRef Interface (for `object@store` only) +Usage: +```python +class Attachments(dj.Manual): + definition = """ + attachment_id : int + --- + config : # internal (small file in DB) + data_file : # external (default store) + archive : # external (specific store) + """ +``` -Only `object@store` returns `ObjectRef` for explicit lazy access. This is intentional - -large files and folders (Zarr, HDF5, etc.) 
benefit from user-controlled download/access. +## Type Layering Summary -```python -class ObjectRef: - """Lazy reference to stored object (object@store only).""" - - def __init__(self, path, store): - self.path = path - self.store = store - - def download(self, local_path=None) -> Path: - """Download object to local filesystem.""" - if local_path is None: - local_path = Path(dj.config['download_path']) / Path(self.path).name - self.store.download(self.path, local_path) - return local_path - - def open(self, mode='rb'): - """Open via fsspec for streaming/direct access.""" - return self.store.open(self.path, mode) - - def exists(self) -> bool: - """Check if object exists in store.""" - return self.store.exists(self.path) +``` +┌─────────────────────────────────────────────────────────────┐ +│ AttributeTypes │ +│ │ +├─────────────────────────────────────────────────────────────┤ +│ Core DataJoint Types │ +│ longblob content object │ +│ content@store object@store │ +├─────────────────────────────────────────────────────────────┤ +│ MySQL Types │ +│ LONGBLOB CHAR(64) JSON VARCHAR INT etc. 
│ +└─────────────────────────────────────────────────────────────┘ ``` -## Summary +## Storage Comparison -| Type | Storage | Column | Dedup | Returns | -|------|---------|--------|-------|---------| -| `object@store` | `{schema}/{table}/{pk}/` | JSON | No | ObjectRef (lazy) | -| `` | Internal DB | LONGBLOB | No | Python object | -| `` | `_content/{hash}/` | char(64) | Yes | Python object | -| `` | Internal DB | LONGBLOB | No | Local file path | -| `` | `_content/{hash}/` | char(64) | Yes | Local file path | +| AttributeType | Core Type | Storage Location | Dedup | Returns | +|---------------|-----------|------------------|-------|---------| +| `` | `longblob` | Database | No | Python object | +| `` | `content` | `_content/{hash}/` | Yes | Python object | +| `` | `content@store` | `_content/{hash}/` | Yes | Python object | +| `` | `longblob` | Database | No | Local file path | +| `` | `content` | `_content/{hash}/` | Yes | Local file path | +| `` | `content@store` | `_content/{hash}/` | Yes | Local file path | +| — | `object` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| — | `object@store` | `{schema}/{table}/{pk}/` | No | ObjectRef | -## Key Design Decisions +## Reference Counting for Content Type + +The `ContentRegistry` table tracks content-addressed objects: + +```python +class ContentRegistry: + definition = """ + # Content-addressed object registry + content_hash : char(64) # SHA256 hex + --- + store : varchar(64) # Store name + size : bigint unsigned # Size in bytes + created : timestamp DEFAULT CURRENT_TIMESTAMP + """ +``` -1. **Unified OAS paradigm**: All external storage uses OAS infrastructure -2. **Content-addressed region**: `_content/` folder for deduplicated objects -3. **Reference counting**: Via `ContentRegistry` table + query-based orphan detection -4. **Transparent access**: `` and `` return same types as internal variants -5. **Lazy access for objects**: Only `object@store` returns ObjectRef (for large files/folders) -6. 
**Deduplication**: Content hash determines identity; identical content stored once +Garbage collection finds orphaned content: -## Migration from Legacy `~external_*` +```python +def garbage_collect(schema): + """Remove content not referenced by any table.""" + # Get all registered hashes + registered = set(ContentRegistry().fetch('content_hash', 'store')) + + # Get all referenced hashes from tables with content-type columns + referenced = set() + for table in schema.tables: + for attr in table.heading.attributes: + if attr.type in ('content', 'content@...'): + hashes = table.fetch(attr.name) + referenced.update((h, attr.store) for h in hashes) + + # Delete orphaned content + for content_hash, store in (registered - referenced): + store_backend = get_store(store) + store_backend.delete(content_path(content_hash)) + (ContentRegistry() & {'content_hash': content_hash}).delete() +``` -For existing schemas with `~external_*` tables: +## Key Design Decisions -1. Read legacy external references -2. Re-upload to `_content/` region -3. Update column values to content hashes -4. Drop `~external_*` tables -5. Create `ContentRegistry` entries +1. **Layered architecture**: Core types (`content`, `object`) separate from AttributeTypes +2. **Content type**: New core type for content-addressed, deduplicated storage +3. **Naming convention**: + - `<djblob>` = internal serialized (database) + - `<xblob>` = external serialized (content-addressed) + - `<attach>` = internal file + - `<xattach>` = external file +4. **Transparent access**: AttributeTypes return Python objects or file paths, not references +5. **Lazy access for objects**: Only `object`/`object@store` returns ObjectRef + +## Migration from Legacy Types + +| Legacy | New Equivalent | +|--------|----------------| +| `longblob` (auto-serialized) | `<djblob>` | +| `blob@store` | `<xblob@store>` | +| `attach` | `<attach>` | +| `attach@store` | `<xattach@store>` | +| `filepath@store` | Deprecated (use `object@store` or `<xattach@store>`) | ## Open Questions -1.
**Hash collision**: SHA256 is effectively collision-free, but should we verify on fetch? -2. **Partial uploads**: How to handle interrupted uploads? Temp path then rename? -3. **Cross-schema deduplication**: Should `_content/` be per-schema or global? -4. **Backward compat**: How long to support reading from legacy `~external_*`? +1. Should `content` without `@store` use a default store, or require explicit store? +2. Should we support `<xblob>` without `@store` syntax (implying default store)? +3. Should `filepath@store` be kept for backward compat or fully deprecated? From 6fcc4d310b1996f8734f6a2bb898f4be817f41c4 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 05:50:31 +0000 Subject: [PATCH 15/42] Add parameterized AttributeTypes and content vs object comparison - content type is single-blob only (no folders) - Parameterized syntax: <type@param> passes param to dtype - Add content vs object comparison table - Clarify when to use each type Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 57 +++++++++++++++++--- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 3b48bb50a..09e3ebecf 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -34,10 +34,12 @@ class Analysis(dj.Computed): **New core type.** Content-addressed storage with deduplication: -- Path derived from content hash: `_content/{hash[:2]}/{hash[2:4]}/{hash}/` +- **Single blob only**: stores a single file or serialized object (not folders) +- Path derived from content hash: `_content/{hash[:2]}/{hash[2:4]}/{hash}` - Many-to-one: multiple rows can reference same content - Reference counted for garbage collection - Deduplication: identical content stored once +- For folders/complex objects, use `object` type instead ``` store_root/ @@ -92,6 +94,31 @@ The `content` type stores a `char(64)` hash in the database:
features CHAR(64) NOT NULL -- SHA256 hex hash ``` +## Parameterized AttributeTypes + +AttributeTypes can be parameterized with `<type@param>` syntax. The parameter is passed +through to the underlying dtype: + +```python +class AttributeType: + type_name: str # Name used in <brackets> + dtype: str # Base underlying type + + # When user writes <type_name@param>, resolved dtype becomes: + # f"{dtype}@{param}" if param specified, else dtype +``` + +**Resolution examples:** +``` +<xblob> → dtype = "content" → default store +<xblob@cold> → dtype = "content@cold" → cold store +<djblob> → dtype = "longblob" → database +<djblob@x> → ERROR: longblob doesn't support parameters +``` + +This means `<xblob>` and `<xblob@store>` share the same AttributeType class - the +parameter flows through to the core type, which validates whether it supports `@store`. + ## AttributeTypes (Built on Core Types) ### `<djblob>` - Internal Serialized Blob @@ -272,17 +299,33 @@ def garbage_collect(schema): (ContentRegistry() & {'content_hash': content_hash}).delete() ``` +## Content vs Object: When to Use Each + +| Feature | `content` | `object` | +|---------|-----------|----------| +| Addressing | Content hash (SHA256) | Path (from primary key) | +| Deduplication | Yes | No | +| Structure | Single blob only | Files, folders, Zarr, HDF5 | +| Access | Transparent (returns bytes) | Lazy (returns ObjectRef) | +| GC | Reference counted | Deleted with row | +| Use case | Serialized data, file attachments | Large/complex objects, streaming | + +**Rule of thumb:** +- Need deduplication or storing serialized Python objects? → `content` via `<xblob>` +- Need folders, Zarr, HDF5, or streaming access? → `object` + ## Key Design Decisions 1. **Layered architecture**: Core types (`content`, `object`) separate from AttributeTypes -2. **Content type**: New core type for content-addressed, deduplicated storage -3. **Naming convention**: +2. **Content type**: Single-blob, content-addressed, deduplicated storage +3. **Parameterized types**: `<type@param>` passes parameter to underlying dtype +4.
**Naming convention**: - `` = internal serialized (database) - `` = external serialized (content-addressed) - - `` = internal file - - `` = external file -4. **Transparent access**: AttributeTypes return Python objects or file paths, not references -5. **Lazy access for objects**: Only `object`/`object@store` returns ObjectRef + - `` = internal file (single file) + - `` = external file (single file) +5. **Transparent access**: AttributeTypes return Python objects or file paths, not references +6. **Lazy access for objects**: Only `object`/`object@store` returns ObjectRef ## Migration from Legacy Types From b87342bddc4afe1d7ba14ed863ee2af08825f30f Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 05:53:34 +0000 Subject: [PATCH 16/42] Make content storage per-project and add migration utility - Content-addressed storage is now per-project (not per-schema) - Deduplication works across all schemas in a project - ContentRegistry is project-level (e.g., {project}_content database) - GC scans all schemas in project for references - Add migration utility for legacy ~external_* per-schema stores - Document migration from binary(16) UUID to char(64) SHA256 hash Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 90 +++++++++++++++++--- 1 file changed, 77 insertions(+), 13 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 09e3ebecf..381cbf1c5 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -35,10 +35,11 @@ class Analysis(dj.Computed): **New core type.** Content-addressed storage with deduplication: - **Single blob only**: stores a single file or serialized object (not folders) +- **Per-project scope**: content is shared across all schemas in a project (not per-schema) - Path derived from content hash: `_content/{hash[:2]}/{hash[2:4]}/{hash}` -- Many-to-one: multiple rows can reference same content 
+- Many-to-one: multiple rows (even across schemas) can reference same content - Reference counted for garbage collection -- Deduplication: identical content stored once +- Deduplication: identical content stored once across the entire project - For folders/complex objects, use `object` type instead ``` @@ -262,12 +263,17 @@ class Attachments(dj.Manual): ## Reference Counting for Content Type -The `ContentRegistry` table tracks content-addressed objects: +The `ContentRegistry` is a **project-level** table that tracks content-addressed objects +across all schemas. This differs from the legacy `~external_*` tables which were per-schema. ```python class ContentRegistry: + """ + Project-level content registry. + Stored in a designated database (e.g., `{project}_content`). + """ definition = """ - # Content-addressed object registry + # Content-addressed object registry (project-wide) content_hash : char(64) # SHA256 hex --- store : varchar(64) # Store name @@ -276,21 +282,22 @@ class ContentRegistry: """ ``` -Garbage collection finds orphaned content: +Garbage collection scans **all schemas** in the project: ```python -def garbage_collect(schema): - """Remove content not referenced by any table.""" +def garbage_collect(project): + """Remove content not referenced by any table in any schema.""" # Get all registered hashes registered = set(ContentRegistry().fetch('content_hash', 'store')) - # Get all referenced hashes from tables with content-type columns + # Get all referenced hashes from ALL schemas in the project referenced = set() - for table in schema.tables: - for attr in table.heading.attributes: - if attr.type in ('content', 'content@...'): - hashes = table.fetch(attr.name) - referenced.update((h, attr.store) for h in hashes) + for schema in project.schemas: + for table in schema.tables: + for attr in table.heading.attributes: + if attr.type in ('content', 'content@...'): + hashes = table.fetch(attr.name) + referenced.update((h, attr.store) for h in hashes) # 
Delete orphaned content for content_hash, store in (registered - referenced): @@ -337,8 +344,65 @@ def garbage_collect(schema): | `attach@store` | `` | | `filepath@store` | Deprecated (use `object@store` or ``) | +### Migration from Legacy `~external_*` Stores + +Legacy external storage used per-schema `~external_{store}` tables. Migration to the new +per-project `ContentRegistry` requires: + +```python +def migrate_external_store(schema, store_name): + """ + Migrate legacy ~external_{store} to new ContentRegistry. + + 1. Read all entries from ~external_{store} + 2. For each entry: + - Fetch content from legacy location + - Compute SHA256 hash + - Copy to _content/{hash}/ if not exists + - Update table column from UUID to hash + - Register in ContentRegistry + 3. After all schemas migrated, drop ~external_{store} tables + """ + external_table = schema.external[store_name] + + for entry in external_table.fetch(as_dict=True): + legacy_uuid = entry['hash'] + + # Fetch content from legacy location + content = external_table.get(legacy_uuid) + + # Compute new content hash + content_hash = hashlib.sha256(content).hexdigest() + + # Store in new location if not exists + new_path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + store = get_store(store_name) + if not store.exists(new_path): + store.put(new_path, content) + + # Register in project-wide ContentRegistry + ContentRegistry().insert1({ + 'content_hash': content_hash, + 'store': store_name, + 'size': len(content) + }, skip_duplicates=True) + + # Update referencing tables (UUID -> hash) + # ... update all tables that reference this UUID ... 
+ + # After migration complete for all schemas: + # DROP TABLE `{schema}`.`~external_{store}` +``` + +**Migration considerations:** +- Legacy UUIDs were based on content hash but stored as `binary(16)` +- New system uses `char(64)` SHA256 hex strings +- Migration can be done incrementally per schema +- Backward compatibility layer can read both formats during transition + ## Open Questions 1. Should `content` without `@store` use a default store, or require explicit store? 2. Should we support `` without `@store` syntax (implying default store)? 3. Should `filepath@store` be kept for backward compat or fully deprecated? +4. How long should the backward compatibility layer support legacy `~external_*` format? From 40c1dbbca99517ca8a64bdf41a4994219addd31d Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 06:01:02 +0000 Subject: [PATCH 17/42] Add filepath as third OAS region with ObjectRef interface Three OAS storage regions: 1. object: {schema}/{table}/{pk}/ - PK-addressed, DataJoint controls 2. content: _content/{hash} - content-addressed, deduplicated 3. filepath: _files/{user-path} - user-addressed, user controls Upgraded filepath@store: - Returns ObjectRef (lazy) instead of copying files - Supports streaming via ref.open() - Supports folders (like object) - Stores checksum in JSON column for verification - No more automatic copy to local stage Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 188 ++++++++++++++----- 1 file changed, 145 insertions(+), 43 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 381cbf1c5..7ca4522c6 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -5,9 +5,17 @@ This document defines a layered storage architecture: 1. **MySQL types**: `longblob`, `varchar`, `int`, etc. -2. **Core DataJoint types**: `object`, `content` (and their `@store` variants) +2. 
**Core DataJoint types**: `object`, `content`, `filepath` (and their `@store` variants) 3. **AttributeTypes**: ``, ``, ``, etc. (built on top of core types) +### Three OAS Storage Regions + +| Region | Path Pattern | Addressing | Use Case | +|--------|--------------|------------|----------| +| Object | `{schema}/{table}/{pk}/` | Primary key | Large objects, Zarr, HDF5 | +| Content | `_content/{hash}` | Content hash | Deduplicated blobs/files | +| Filepath | `_files/{user-path}` | User-defined | User-organized files | + ## Core Types ### `object` / `object@store` - Path-Addressed Storage @@ -44,11 +52,14 @@ class Analysis(dj.Computed): ``` store_root/ -├── {schema}/{table}/{pk}/ # object storage (path-addressed) +├── {schema}/{table}/{pk}/ # object storage (path-addressed by PK) │ └── {attribute}/ │ -└── _content/ # content storage (content-addressed) - └── {hash[:2]}/{hash[2:4]}/{hash}/ +├── _content/ # content storage (content-addressed) +│ └── {hash[:2]}/{hash[2:4]}/{hash} +│ +└── _files/ # filepath storage (user-addressed) + └── {user-defined-path} ``` #### Content Type Behavior @@ -95,6 +106,92 @@ The `content` type stores a `char(64)` hash in the database: features CHAR(64) NOT NULL -- SHA256 hex hash ``` +### `filepath` / `filepath@store` - User-Addressed Storage + +**Upgraded from legacy.** User-defined path organization with ObjectRef access: + +- **User controls paths**: relative path specified by user (not derived from PK or hash) +- Stored in `_files/{user-path}` within the store +- Returns `ObjectRef` for lazy access (no automatic copying) +- Stores checksum in database for verification +- Supports files and folders (like `object`) + +```python +class RawData(dj.Manual): + definition = """ + session_id : int + --- + recording : filepath@raw # user specifies path + """ + +# Insert - user provides relative path +table.insert1({ + 'session_id': 1, + 'recording': 'experiment_001/session_001/data.nwb' +}) + +# Fetch - returns ObjectRef (lazy, no copy) +row = 
(table & 'session_id=1').fetch1() +ref = row['recording'] # ObjectRef +ref.download('/local/path') # explicit download +ref.open() # fsspec streaming access +``` + +#### Filepath Type Behavior + +```python +# Core type behavior +class FilepathType: + """Core user-addressed storage type.""" + + def store(self, user_path: str, store_backend) -> dict: + """ + Register filepath, return metadata. + File must already exist at _files/{user_path} in store. + """ + full_path = f"_files/{user_path}" + if not store_backend.exists(full_path): + raise FileNotFoundError(f"File not found: {full_path}") + + # Compute checksum for verification + checksum = store_backend.checksum(full_path) + size = store_backend.size(full_path) + + return { + 'path': user_path, + 'checksum': checksum, + 'size': size + } + + def retrieve(self, metadata: dict, store_backend) -> ObjectRef: + """Return ObjectRef for lazy access.""" + return ObjectRef( + path=f"_files/{metadata['path']}", + store=store_backend, + checksum=metadata.get('checksum') # for verification + ) +``` + +#### Database Column + +The `filepath` type stores JSON metadata: + +```sql +-- filepath column +recording JSON NOT NULL +-- Contains: {"path": "...", "checksum": "...", "size": ...} +``` + +#### Key Differences from Legacy `filepath@store` + +| Feature | Legacy | New | +|---------|--------|-----| +| Access | Copy to local stage | ObjectRef (lazy) | +| Copying | Automatic | Explicit via `ref.download()` | +| Streaming | No | Yes via `ref.open()` | +| Folders | No | Yes | +| Interface | Returns local path | Returns ObjectRef | + ## Parameterized AttributeTypes AttributeTypes can be parameterized with `` syntax. 
The parameter is passed @@ -235,31 +332,32 @@ class Attachments(dj.Manual): ## Type Layering Summary ``` -┌─────────────────────────────────────────────────────────────┐ -│ AttributeTypes │ -│ │ -├─────────────────────────────────────────────────────────────┤ -│ Core DataJoint Types │ -│ longblob content object │ -│ content@store object@store │ -├─────────────────────────────────────────────────────────────┤ -│ MySQL Types │ -│ LONGBLOB CHAR(64) JSON VARCHAR INT etc. │ -└─────────────────────────────────────────────────────────────┘ +┌───────────────────────────────────────────────────────────────────┐ +│ AttributeTypes │ +│ │ +├───────────────────────────────────────────────────────────────────┤ +│ Core DataJoint Types │ +│ longblob content object filepath │ +│ content@s object@s filepath@s │ +├───────────────────────────────────────────────────────────────────┤ +│ MySQL Types │ +│ LONGBLOB CHAR(64) JSON JSON VARCHAR etc. │ +└───────────────────────────────────────────────────────────────────┘ ``` ## Storage Comparison -| AttributeType | Core Type | Storage Location | Dedup | Returns | -|---------------|-----------|------------------|-------|---------| +| Type | Core Type | Storage Location | Dedup | Returns | +|------|-----------|------------------|-------|---------| | `` | `longblob` | Database | No | Python object | -| `` | `content` | `_content/{hash}/` | Yes | Python object | -| `` | `content@store` | `_content/{hash}/` | Yes | Python object | +| `` | `content` | `_content/{hash}` | Yes | Python object | +| `` | `content@s` | `_content/{hash}` | Yes | Python object | | `` | `longblob` | Database | No | Local file path | -| `` | `content` | `_content/{hash}/` | Yes | Local file path | -| `` | `content@store` | `_content/{hash}/` | Yes | Local file path | -| — | `object` | `{schema}/{table}/{pk}/` | No | ObjectRef | -| — | `object@store` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `` | `content` | `_content/{hash}` | Yes | Local file path | +| `` | 
`content@s` | `_content/{hash}` | Yes | Local file path | +| `object` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `object@s` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `filepath@s` | — | `_files/{user-path}` | No | ObjectRef | ## Reference Counting for Content Type @@ -306,33 +404,37 @@ def garbage_collect(project): (ContentRegistry() & {'content_hash': content_hash}).delete() ``` -## Content vs Object: When to Use Each +## Core Type Comparison -| Feature | `content` | `object` | -|---------|-----------|----------| -| Addressing | Content hash (SHA256) | Path (from primary key) | -| Deduplication | Yes | No | -| Structure | Single blob only | Files, folders, Zarr, HDF5 | -| Access | Transparent (returns bytes) | Lazy (returns ObjectRef) | -| GC | Reference counted | Deleted with row | -| Use case | Serialized data, file attachments | Large/complex objects, streaming | +| Feature | `object` | `content` | `filepath` | +|---------|----------|-----------|------------| +| Addressing | Primary key | Content hash | User-defined path | +| Path control | DataJoint | DataJoint | User | +| Deduplication | No | Yes | No | +| Structure | Files, folders, Zarr | Single blob only | Files, folders | +| Access | ObjectRef (lazy) | Transparent (bytes) | ObjectRef (lazy) | +| GC | Deleted with row | Reference counted | Deleted with row | +| Checksum | Optional | Implicit (is the hash) | Stored in DB | -**Rule of thumb:** -- Need deduplication or storing serialized Python objects? → `content` via `` -- Need folders, Zarr, HDF5, or streaming access? → `object` +**When to use each:** +- **`object`**: Large/complex objects where DataJoint controls organization (Zarr, HDF5) +- **`content`**: Deduplicated serialized data or file attachments via ``, `` +- **`filepath`**: User-managed file organization, external data sources ## Key Design Decisions -1. **Layered architecture**: Core types (`content`, `object`) separate from AttributeTypes -2. 
**Content type**: Single-blob, content-addressed, deduplicated storage -3. **Parameterized types**: `` passes parameter to underlying dtype -4. **Naming convention**: +1. **Layered architecture**: Core types (`object`, `content`, `filepath`) separate from AttributeTypes +2. **Three OAS regions**: object (PK-addressed), content (hash-addressed), filepath (user-addressed) +3. **Content type**: Single-blob, content-addressed, deduplicated storage +4. **Filepath upgrade**: Returns ObjectRef (lazy) instead of copying files +5. **Parameterized types**: `` passes parameter to underlying dtype +6. **Naming convention**: - `` = internal serialized (database) - `` = external serialized (content-addressed) - `` = internal file (single file) - `` = external file (single file) -5. **Transparent access**: AttributeTypes return Python objects or file paths, not references -6. **Lazy access for objects**: Only `object`/`object@store` returns ObjectRef +7. **Transparent access**: AttributeTypes return Python objects or file paths +8. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef ## Migration from Legacy Types @@ -342,7 +444,7 @@ def garbage_collect(project): | `blob@store` | `` | | `attach` | `` | | `attach@store` | `` | -| `filepath@store` | Deprecated (use `object@store` or ``) | +| `filepath@store` (copy-based) | `filepath@store` (ObjectRef-based, upgraded) | ### Migration from Legacy `~external_*` Stores @@ -404,5 +506,5 @@ def migrate_external_store(schema, store_name): 1. Should `content` without `@store` use a default store, or require explicit store? 2. Should we support `` without `@store` syntax (implying default store)? -3. Should `filepath@store` be kept for backward compat or fully deprecated? +3. Should `filepath` without `@store` be supported (using default store)? 4. How long should the backward compatibility layer support legacy `~external_*` format? 
From dbf092d76ed7d1f206453a70882da68d2bbbc3cf Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 06:06:06 +0000 Subject: [PATCH 18/42] Redesign filepath as URI reference tracker and add json core type filepath changes: - No longer an OAS region - tracks external URIs anywhere - Supports any fsspec-compatible URI (s3://, https://, gs://, etc.) - Returns ObjectRef for lazy access via fsspec - No integrity guarantees (external resources may change) - Uses json core type for storage json core type: - Cross-database compatible (MySQL JSON, PostgreSQL JSONB) - Used by filepath and object types Two OAS regions remain: - object: PK-addressed, DataJoint controlled - content: hash-addressed, deduplicated Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 168 ++++++++++++------- 1 file changed, 106 insertions(+), 62 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 7ca4522c6..b4b149628 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -4,17 +4,24 @@ This document defines a layered storage architecture: -1. **MySQL types**: `longblob`, `varchar`, `int`, etc. -2. **Core DataJoint types**: `object`, `content`, `filepath` (and their `@store` variants) +1. **Database types**: `longblob`, `varchar`, `int`, `json`, etc. +2. **Core DataJoint types**: `object`, `content`, `filepath`, `json` (and `@store` variants where applicable) 3. **AttributeTypes**: ``, ``, ``, etc. 
(built on top of core types) -### Three OAS Storage Regions +### OAS Storage Regions | Region | Path Pattern | Addressing | Use Case | |--------|--------------|------------|----------| | Object | `{schema}/{table}/{pk}/` | Primary key | Large objects, Zarr, HDF5 | | Content | `_content/{hash}` | Content hash | Deduplicated blobs/files | -| Filepath | `_files/{user-path}` | User-defined | User-organized files | + +### External References + +`filepath` is **not** an OAS region - it's a general reference tracker for external resources: +- OAS store paths: `store://main/experiment/data.h5` +- URLs: `https://example.com/dataset.zip` +- S3: `s3://bucket/key/file.nwb` +- Any fsspec-compatible URI ## Core Types @@ -55,11 +62,8 @@ store_root/ ├── {schema}/{table}/{pk}/ # object storage (path-addressed by PK) │ └── {attribute}/ │ -├── _content/ # content storage (content-addressed) -│ └── {hash[:2]}/{hash[2:4]}/{hash} -│ -└── _files/ # filepath storage (user-addressed) - └── {user-defined-path} +└── _content/ # content storage (content-addressed) + └── {hash[:2]}/{hash[2:4]}/{hash} ``` #### Content Type Behavior @@ -106,31 +110,41 @@ The `content` type stores a `char(64)` hash in the database: features CHAR(64) NOT NULL -- SHA256 hex hash ``` -### `filepath` / `filepath@store` - User-Addressed Storage +### `filepath` - External Reference Tracker -**Upgraded from legacy.** User-defined path organization with ObjectRef access: +**Upgraded from legacy.** General-purpose reference tracker for external resources: -- **User controls paths**: relative path specified by user (not derived from PK or hash) -- Stored in `_files/{user-path}` within the store -- Returns `ObjectRef` for lazy access (no automatic copying) -- Stores checksum in database for verification -- Supports files and folders (like `object`) +- **Not an OAS region**: references can point anywhere (URLs, S3, OAS stores, etc.) 
+- **User controls URIs**: any fsspec-compatible URI +- Returns `ObjectRef` for lazy access via fsspec +- Stores optional checksum for verification +- No integrity guarantees (external resources may change/disappear) ```python class RawData(dj.Manual): definition = """ session_id : int --- - recording : filepath@raw # user specifies path + recording : filepath # external reference """ -# Insert - user provides relative path +# Insert - user provides URI (various protocols) table.insert1({ 'session_id': 1, - 'recording': 'experiment_001/session_001/data.nwb' + 'recording': 's3://my-bucket/experiment_001/data.nwb' +}) +# Or URL +table.insert1({ + 'session_id': 2, + 'recording': 'https://example.com/public/dataset.h5' +}) +# Or OAS store reference +table.insert1({ + 'session_id': 3, + 'recording': 'store://main/custom/path/file.zarr' }) -# Fetch - returns ObjectRef (lazy, no copy) +# Fetch - returns ObjectRef (lazy) row = (table & 'session_id=1').fetch1() ref = row['recording'] # ObjectRef ref.download('/local/path') # explicit download @@ -142,55 +156,82 @@ ref.open() # fsspec streaming access ```python # Core type behavior class FilepathType: - """Core user-addressed storage type.""" + """Core external reference type.""" - def store(self, user_path: str, store_backend) -> dict: + def store(self, uri: str, compute_checksum: bool = False) -> dict: """ - Register filepath, return metadata. - File must already exist at _files/{user_path} in store. + Register external reference, return metadata. + Optionally compute checksum for verification. 
""" - full_path = f"_files/{user_path}" - if not store_backend.exists(full_path): - raise FileNotFoundError(f"File not found: {full_path}") + metadata = {'uri': uri} - # Compute checksum for verification - checksum = store_backend.checksum(full_path) - size = store_backend.size(full_path) + if compute_checksum: + # Use fsspec to access and compute checksum + fs, path = fsspec.core.url_to_fs(uri) + if fs.exists(path): + metadata['checksum'] = compute_file_checksum(fs, path) + metadata['size'] = fs.size(path) - return { - 'path': user_path, - 'checksum': checksum, - 'size': size - } + return metadata - def retrieve(self, metadata: dict, store_backend) -> ObjectRef: + def retrieve(self, metadata: dict) -> ObjectRef: """Return ObjectRef for lazy access.""" return ObjectRef( - path=f"_files/{metadata['path']}", - store=store_backend, - checksum=metadata.get('checksum') # for verification + uri=metadata['uri'], + checksum=metadata.get('checksum') # optional verification ) ``` #### Database Column -The `filepath` type stores JSON metadata: +The `filepath` type uses the `json` core type: ```sql --- filepath column +-- filepath column (MySQL) recording JSON NOT NULL --- Contains: {"path": "...", "checksum": "...", "size": ...} +-- Contains: {"uri": "s3://...", "checksum": "...", "size": ...} + +-- filepath column (PostgreSQL) +recording JSONB NOT NULL ``` +#### Supported URI Schemes + +| Scheme | Example | Backend | +|--------|---------|---------| +| `s3://` | `s3://bucket/key/file.nwb` | S3 via fsspec | +| `gs://` | `gs://bucket/object` | Google Cloud Storage | +| `https://` | `https://example.com/data.h5` | HTTP(S) | +| `file://` | `file:///local/path/data.csv` | Local filesystem | +| `store://` | `store://main/path/file.zarr` | OAS store | + #### Key Differences from Legacy `filepath@store` | Feature | Legacy | New | |---------|--------|-----| +| Location | OAS store only | Any URI (S3, HTTP, etc.) 
| | Access | Copy to local stage | ObjectRef (lazy) | | Copying | Automatic | Explicit via `ref.download()` | | Streaming | No | Yes via `ref.open()` | -| Folders | No | Yes | -| Interface | Returns local path | Returns ObjectRef | +| Integrity | Managed by DataJoint | External (may change) | +| Store param | Required (`@store`) | Optional (embedded in URI) | + +### `json` - Cross-Database JSON Type + +**New core type.** JSON storage compatible across MySQL and PostgreSQL: + +```sql +-- MySQL +column_name JSON NOT NULL + +-- PostgreSQL +column_name JSONB NOT NULL +``` + +The `json` core type: +- Stores arbitrary JSON-serializable data +- Automatically uses appropriate type for database backend +- Supports JSON path queries where available ## Parameterized AttributeTypes @@ -337,11 +378,12 @@ class Attachments(dj.Manual): │ │ ├───────────────────────────────────────────────────────────────────┤ │ Core DataJoint Types │ -│ longblob content object filepath │ -│ content@s object@s filepath@s │ +│ longblob content object filepath json │ +│ content@s object@s │ ├───────────────────────────────────────────────────────────────────┤ -│ MySQL Types │ -│ LONGBLOB CHAR(64) JSON JSON VARCHAR etc. │ +│ Database Types │ +│ LONGBLOB CHAR(64) JSON JSON/JSONB VARCHAR etc. 
│ +│ (MySQL) (PostgreSQL) │ └───────────────────────────────────────────────────────────────────┘ ``` @@ -357,7 +399,7 @@ class Attachments(dj.Manual): | `` | `content@s` | `_content/{hash}` | Yes | Local file path | | `object` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | | `object@s` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `filepath@s` | — | `_files/{user-path}` | No | ObjectRef | +| `filepath` | `json` | External (any URI) | No | ObjectRef | ## Reference Counting for Content Type @@ -408,33 +450,35 @@ def garbage_collect(project): | Feature | `object` | `content` | `filepath` | |---------|----------|-----------|------------| -| Addressing | Primary key | Content hash | User-defined path | +| Location | OAS store | OAS store | Anywhere (URI) | +| Addressing | Primary key | Content hash | User URI | | Path control | DataJoint | DataJoint | User | | Deduplication | No | Yes | No | -| Structure | Files, folders, Zarr | Single blob only | Files, folders | +| Structure | Files, folders, Zarr | Single blob only | Any (via fsspec) | | Access | ObjectRef (lazy) | Transparent (bytes) | ObjectRef (lazy) | -| GC | Deleted with row | Reference counted | Deleted with row | -| Checksum | Optional | Implicit (is the hash) | Stored in DB | +| GC | Deleted with row | Reference counted | N/A (external) | +| Integrity | DataJoint managed | DataJoint managed | External (no guarantees) | **When to use each:** - **`object`**: Large/complex objects where DataJoint controls organization (Zarr, HDF5) - **`content`**: Deduplicated serialized data or file attachments via ``, `` -- **`filepath`**: User-managed file organization, external data sources +- **`filepath`**: External references (S3, URLs, etc.) not managed by DataJoint ## Key Design Decisions -1. **Layered architecture**: Core types (`object`, `content`, `filepath`) separate from AttributeTypes -2. **Three OAS regions**: object (PK-addressed), content (hash-addressed), filepath (user-addressed) -3. 
**Content type**: Single-blob, content-addressed, deduplicated storage -4. **Filepath upgrade**: Returns ObjectRef (lazy) instead of copying files -5. **Parameterized types**: `<type@param>` passes parameter to underlying dtype -6. **Naming convention**: +1. **Layered architecture**: Core types (`object`, `content`, `filepath`, `json`) separate from AttributeTypes +2. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores +3. **Filepath as reference tracker**: Not an OAS region - tracks external URIs (S3, HTTP, etc.) +4. **Content type**: Single-blob, content-addressed, deduplicated storage +5. **JSON core type**: Cross-database compatible (MySQL JSON, PostgreSQL JSONB) +6. **Parameterized types**: `<type@param>` passes parameter to underlying dtype +7. **Naming convention**: - `<djblob>` = internal serialized (database) - `<xblob>` = external serialized (content-addressed) - `<attach>` = internal file (single file) - `<xattach>` = external file (single file) -7. **Transparent access**: AttributeTypes return Python objects or file paths -8. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef +8. **Transparent access**: AttributeTypes return Python objects or file paths +9.
**Lazy access**: `object`, `object@store`, and `filepath` return ObjectRef ## Migration from Legacy Types From 43c1999c6792600659bfd55b79501e0323fc7604 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 06:19:43 +0000 Subject: [PATCH 19/42] Simplify filepath to filepath@store with relative paths for portability - Remove general URI tracker concept from filepath - filepath@store now requires a store parameter and uses relative paths - Key benefit: portability across environments by changing store config - For arbitrary URLs, recommend using varchar (simpler, more transparent) - Add comparison table for filepath@store vs varchar use cases - Update all diagrams and tables to reflect the change Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 130 +++++++++---------- 1 file changed, 60 insertions(+), 70 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index b4b149628..f34d1b84a 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -17,11 +17,8 @@ This document defines a layered storage architecture: ### External References -`filepath` is **not** an OAS region - it's a general reference tracker for external resources: -- OAS store paths: `store://main/experiment/data.h5` -- URLs: `https://example.com/dataset.zip` -- S3: `s3://bucket/key/file.nwb` -- Any fsspec-compatible URI +`filepath@store` provides portable relative paths within configured stores with lazy ObjectRef access. +For arbitrary URLs that don't need ObjectRef semantics, use `varchar` instead. 
## Core Types @@ -110,38 +107,31 @@ The `content` type stores a `char(64)` hash in the database: features CHAR(64) NOT NULL -- SHA256 hex hash ``` -### `filepath` - External Reference Tracker +### `filepath@store` - Portable External Reference -**Upgraded from legacy.** General-purpose reference tracker for external resources: +**Upgraded from legacy.** Relative path references within configured stores: -- **Not an OAS region**: references can point anywhere (URLs, S3, OAS stores, etc.) -- **User controls URIs**: any fsspec-compatible URI +- **Relative paths**: paths within a configured store (portable across environments) +- **Store-aware**: resolves paths against configured store backend - Returns `ObjectRef` for lazy access via fsspec - Stores optional checksum for verification -- No integrity guarantees (external resources may change/disappear) + +**Key benefit**: Portability. The path is relative to the store, so pipelines can be moved +between environments (dev → prod, cloud → local) by changing store configuration without +updating data. 
```python class RawData(dj.Manual): definition = """ session_id : int --- - recording : filepath # external reference + recording : filepath@main # relative path within 'main' store """ -# Insert - user provides URI (various protocols) +# Insert - user provides relative path within the store table.insert1({ 'session_id': 1, - 'recording': 's3://my-bucket/experiment_001/data.nwb' -}) -# Or URL -table.insert1({ - 'session_id': 2, - 'recording': 'https://example.com/public/dataset.h5' -}) -# Or OAS store reference -table.insert1({ - 'session_id': 3, - 'recording': 'store://main/custom/path/file.zarr' + 'recording': 'experiment_001/data.nwb' # relative to main store root }) # Fetch - returns ObjectRef (lazy) @@ -151,33 +141,43 @@ ref.download('/local/path') # explicit download ref.open() # fsspec streaming access ``` +#### When to Use `filepath@store` vs `varchar` + +| Use Case | Recommended Type | +|----------|------------------| +| Need ObjectRef/lazy access | `filepath@store` | +| Need portability (relative paths) | `filepath@store` | +| Want checksum verification | `filepath@store` | +| Just storing a URL string | `varchar` | +| External URLs you don't control | `varchar` | + +For arbitrary URLs (S3, HTTP, etc.) where you don't need ObjectRef semantics, +just use `varchar`. A string is simpler and more transparent. + #### Filepath Type Behavior ```python # Core type behavior class FilepathType: - """Core external reference type.""" + """Core external reference type with store-relative paths.""" - def store(self, uri: str, compute_checksum: bool = False) -> dict: - """ - Register external reference, return metadata. - Optionally compute checksum for verification. 
- """ - metadata = {'uri': uri} + def store(self, relative_path: str, store_backend, compute_checksum: bool = False) -> dict: + """Register reference to file in store.""" + metadata = {'path': relative_path} if compute_checksum: - # Use fsspec to access and compute checksum - fs, path = fsspec.core.url_to_fs(uri) - if fs.exists(path): - metadata['checksum'] = compute_file_checksum(fs, path) - metadata['size'] = fs.size(path) + full_path = store_backend.resolve(relative_path) + if store_backend.exists(full_path): + metadata['checksum'] = compute_file_checksum(store_backend, full_path) + metadata['size'] = store_backend.size(full_path) return metadata - def retrieve(self, metadata: dict) -> ObjectRef: + def retrieve(self, metadata: dict, store_backend) -> ObjectRef: """Return ObjectRef for lazy access.""" return ObjectRef( - uri=metadata['uri'], + store=store_backend, + path=metadata['path'], checksum=metadata.get('checksum') # optional verification ) ``` @@ -189,32 +189,21 @@ The `filepath` type uses the `json` core type: ```sql -- filepath column (MySQL) recording JSON NOT NULL --- Contains: {"uri": "s3://...", "checksum": "...", "size": ...} +-- Contains: {"path": "experiment_001/data.nwb", "checksum": "...", "size": ...} -- filepath column (PostgreSQL) recording JSONB NOT NULL ``` -#### Supported URI Schemes - -| Scheme | Example | Backend | -|--------|---------|---------| -| `s3://` | `s3://bucket/key/file.nwb` | S3 via fsspec | -| `gs://` | `gs://bucket/object` | Google Cloud Storage | -| `https://` | `https://example.com/data.h5` | HTTP(S) | -| `file://` | `file:///local/path/data.csv` | Local filesystem | -| `store://` | `store://main/path/file.zarr` | OAS store | - #### Key Differences from Legacy `filepath@store` | Feature | Legacy | New | |---------|--------|-----| -| Location | OAS store only | Any URI (S3, HTTP, etc.) 
| | Access | Copy to local stage | ObjectRef (lazy) | | Copying | Automatic | Explicit via `ref.download()` | | Streaming | No | Yes via `ref.open()` | -| Integrity | Managed by DataJoint | External (may change) | -| Store param | Required (`@store`) | Optional (embedded in URI) | +| Paths | Relative | Relative (unchanged) | +| Store param | Required (`@store`) | Required (`@store`) | ### `json` - Cross-Database JSON Type @@ -378,7 +367,7 @@ class Attachments(dj.Manual): │ │ ├───────────────────────────────────────────────────────────────────┤ │ Core DataJoint Types │ -│ longblob content object filepath json │ +│ longblob content object filepath@s json │ │ content@s object@s │ ├───────────────────────────────────────────────────────────────────┤ │ Database Types │ @@ -399,7 +388,7 @@ class Attachments(dj.Manual): | `` | `content@s` | `_content/{hash}` | Yes | Local file path | | `object` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | | `object@s` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `filepath` | `json` | External (any URI) | No | ObjectRef | +| `filepath@s` | `json` | Configured store (relative path) | No | ObjectRef | ## Reference Counting for Content Type @@ -448,37 +437,39 @@ def garbage_collect(project): ## Core Type Comparison -| Feature | `object` | `content` | `filepath` | -|---------|----------|-----------|------------| -| Location | OAS store | OAS store | Anywhere (URI) | -| Addressing | Primary key | Content hash | User URI | +| Feature | `object` | `content` | `filepath@store` | +|---------|----------|-----------|------------------| +| Location | OAS store | OAS store | Configured store | +| Addressing | Primary key | Content hash | Relative path | | Path control | DataJoint | DataJoint | User | | Deduplication | No | Yes | No | | Structure | Files, folders, Zarr | Single blob only | Any (via fsspec) | | Access | ObjectRef (lazy) | Transparent (bytes) | ObjectRef (lazy) | -| GC | Deleted with row | Reference counted | N/A (external) | 
-| Integrity | DataJoint managed | DataJoint managed | External (no guarantees) | +| GC | Deleted with row | Reference counted | N/A (user managed) | +| Integrity | DataJoint managed | DataJoint managed | User managed | **When to use each:** - **`object`**: Large/complex objects where DataJoint controls organization (Zarr, HDF5) - **`content`**: Deduplicated serialized data or file attachments via `<xblob>`, `<xattach>` -- **`filepath`**: External references (S3, URLs, etc.) not managed by DataJoint +- **`filepath@store`**: Portable references to files in configured stores +- **`varchar`**: Arbitrary URLs/paths where ObjectRef semantics aren't needed ## Key Design Decisions -1. **Layered architecture**: Core types (`object`, `content`, `filepath`, `json`) separate from AttributeTypes +1. **Layered architecture**: Core types (`object`, `content`, `filepath@store`, `json`) separate from AttributeTypes 2. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores -3. **Filepath as reference tracker**: Not an OAS region - tracks external URIs (S3, HTTP, etc.) -4. **Content type**: Single-blob, content-addressed, deduplicated storage -5. **JSON core type**: Cross-database compatible (MySQL JSON, PostgreSQL JSONB) -6. **Parameterized types**: `<type@param>` passes parameter to underlying dtype -7. **Naming convention**: +3. **Filepath for portability**: `filepath@store` uses relative paths within stores for environment portability +4. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent +5. **Content type**: Single-blob, content-addressed, deduplicated storage +6. **JSON core type**: Cross-database compatible (MySQL JSON, PostgreSQL JSONB) +7. **Parameterized types**: `<type@param>` passes parameter to underlying dtype +8. **Naming convention**: - `<djblob>` = internal serialized (database) - `<xblob>` = external serialized (content-addressed) - `<attach>` = internal file (single file) - `<xattach>` = external file (single file) -8.
**Transparent access**: AttributeTypes return Python objects or file paths -9. **Lazy access**: `object`, `object@store`, and `filepath` return ObjectRef +9. **Transparent access**: AttributeTypes return Python objects or file paths +10. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef ## Migration from Legacy Types @@ -550,5 +541,4 @@ def migrate_external_store(schema, store_name): 1. Should `content` without `@store` use a default store, or require explicit store? 2. Should we support `` without `@store` syntax (implying default store)? -3. Should `filepath` without `@store` be supported (using default store)? -4. How long should the backward compatibility layer support legacy `~external_*` format? +3. How long should the backward compatibility layer support legacy `~external_*` format? From b9b6e34f9196b9c082fb6df37e4058befddd02d1 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 06:25:46 +0000 Subject: [PATCH 20/42] Simplify to two-layer architecture: database types + AttributeTypes - Remove "core types" concept - all storage types are now AttributeTypes - Built-in AttributeTypes (object, content, filepath@store) use json dtype - JSON stores metadata: path, hash, store name, size, etc. - User-defined AttributeTypes can compose built-in ones (e.g., uses content) - Clearer separation: database types (json, longblob) vs AttributeTypes (encode/decode) Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 226 +++++++++++-------- 1 file changed, 137 insertions(+), 89 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index f34d1b84a..32083a88e 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -2,11 +2,14 @@ ## Overview -This document defines a layered storage architecture: +This document defines a two-layer storage architecture: -1. 
**Database types**: `longblob`, `varchar`, `int`, `json`, etc. -2. **Core DataJoint types**: `object`, `content`, `filepath`, `json` (and `@store` variants where applicable) -3. **AttributeTypes**: ``, ``, ``, etc. (built on top of core types) +1. **Database types**: `longblob`, `varchar`, `int`, `json`, etc. (MySQL/PostgreSQL native) +2. **AttributeTypes**: Custom types with `encode()`/`decode()` semantics + +All DataJoint storage types (`object`, `content`, `filepath@store`, ``, etc.) are +implemented as **AttributeTypes**. Some are built-in (auto-registered, use `dj.config` for stores) +while others are user-defined. ### OAS Storage Regions @@ -20,17 +23,21 @@ This document defines a layered storage architecture: `filepath@store` provides portable relative paths within configured stores with lazy ObjectRef access. For arbitrary URLs that don't need ObjectRef semantics, use `varchar` instead. -## Core Types +## Built-in AttributeTypes + +Built-in types are auto-registered and use `dj.config['stores']` for store configuration. +They use `json` as their database dtype to store metadata. 
### `object` / `object@store` - Path-Addressed Storage -**Already implemented.** OAS (Object-Augmented Schema) storage: +**Built-in AttributeType.** OAS (Object-Augmented Schema) storage: - Path derived from primary key: `{schema}/{table}/{pk}/{attribute}/` - One-to-one relationship with table row - Deleted when row is deleted - Returns `ObjectRef` for lazy access - Supports direct writes (Zarr, HDF5) via fsspec +- **dtype**: `json` (stores path, store name, metadata) ```python class Analysis(dj.Computed): @@ -42,9 +49,34 @@ class Analysis(dj.Computed): """ ``` +#### Implementation + +```python +class ObjectType(AttributeType): + """Built-in AttributeType for path-addressed OAS storage.""" + type_name = "object" + dtype = "json" + + def encode(self, value, *, key=None, store_name=None) -> dict: + store = get_store(store_name or dj.config['stores']['default']) + path = self._compute_path(key) # {schema}/{table}/{pk}/{attr}/ + store.put(path, value) + return { + "path": path, + "store": store_name, + # Additional metadata (size, timestamps, etc.) 
+ } + + def decode(self, stored: dict, *, key=None) -> ObjectRef: + return ObjectRef( + store=get_store(stored["store"]), + path=stored["path"] + ) +``` + ### `content` / `content@store` - Content-Addressed Storage -**New core type.** Content-addressed storage with deduplication: +**Built-in AttributeType.** Content-addressed storage with deduplication: - **Single blob only**: stores a single file or serialized object (not folders) - **Per-project scope**: content is shared across all schemas in a project (not per-schema) @@ -53,6 +85,7 @@ class Analysis(dj.Computed): - Reference counted for garbage collection - Deduplication: identical content stored once across the entire project - For folders/complex objects, use `object` type instead +- **dtype**: `json` (stores hash, store name, size, metadata) ``` store_root/ @@ -63,58 +96,63 @@ store_root/ └── {hash[:2]}/{hash[2:4]}/{hash} ``` -#### Content Type Behavior - -The `content` core type: -- Accepts `bytes` on insert -- Computes SHA256 hash of the content -- Stores in `_content/{hash}/` if not already present (deduplication) -- Returns `bytes` on fetch (transparent retrieval) -- Registers in `ContentRegistry` for GC tracking +#### Implementation ```python -# Core type behavior (built-in, not an AttributeType) -class ContentType: - """Core content-addressed storage type.""" +class ContentType(AttributeType): + """Built-in AttributeType for content-addressed storage.""" + type_name = "content" + dtype = "json" - def store(self, data: bytes, store_backend) -> str: - """Store content, return hash.""" + def encode(self, data: bytes, *, key=None, store_name=None) -> dict: + """Store content, return metadata as JSON.""" content_hash = hashlib.sha256(data).hexdigest() + store = get_store(store_name or dj.config['stores']['default']) path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" - if not store_backend.exists(path): - store_backend.put(path, data) + if not store.exists(path): + store.put(path, 
data) ContentRegistry().insert1({ 'content_hash': content_hash, - 'store': store_backend.name, + 'store': store_name, 'size': len(data) - }) + }, skip_duplicates=True) - return content_hash + return { + "hash": content_hash, + "store": store_name, + "size": len(data) + } - def retrieve(self, content_hash: str, store_backend) -> bytes: + def decode(self, stored: dict, *, key=None) -> bytes: """Retrieve content by hash.""" - path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" - return store_backend.get(path) + store = get_store(stored["store"]) + path = f"_content/{stored['hash'][:2]}/{stored['hash'][2:4]}/{stored['hash']}" + return store.get(path) ``` #### Database Column -The `content` type stores a `char(64)` hash in the database: +The `content` type stores JSON metadata: ```sql --- content column -features CHAR(64) NOT NULL -- SHA256 hex hash +-- content column (MySQL) +features JSON NOT NULL +-- Contains: {"hash": "abc123...", "store": "main", "size": 12345} + +-- content column (PostgreSQL) +features JSONB NOT NULL ``` ### `filepath@store` - Portable External Reference -**Upgraded from legacy.** Relative path references within configured stores: +**Built-in AttributeType.** Relative path references within configured stores: - **Relative paths**: paths within a configured store (portable across environments) - **Store-aware**: resolves paths against configured store backend - Returns `ObjectRef` for lazy access via fsspec - Stores optional checksum for verification +- **dtype**: `json` (stores path, store name, checksum, metadata) **Key benefit**: Portability. The path is relative to the store, so pipelines can be moved between environments (dev → prod, cloud → local) by changing store configuration without @@ -154,42 +192,43 @@ ref.open() # fsspec streaming access For arbitrary URLs (S3, HTTP, etc.) where you don't need ObjectRef semantics, just use `varchar`. A string is simpler and more transparent. 
-#### Filepath Type Behavior +#### Implementation ```python -# Core type behavior -class FilepathType: - """Core external reference type with store-relative paths.""" +class FilepathType(AttributeType): + """Built-in AttributeType for store-relative file references.""" + type_name = "filepath" + dtype = "json" - def store(self, relative_path: str, store_backend, compute_checksum: bool = False) -> dict: + def encode(self, relative_path: str, *, key=None, store_name=None, + compute_checksum: bool = False) -> dict: """Register reference to file in store.""" - metadata = {'path': relative_path} + store = get_store(store_name) # store_name required for filepath + metadata = {'path': relative_path, 'store': store_name} if compute_checksum: - full_path = store_backend.resolve(relative_path) - if store_backend.exists(full_path): - metadata['checksum'] = compute_file_checksum(store_backend, full_path) - metadata['size'] = store_backend.size(full_path) + full_path = store.resolve(relative_path) + if store.exists(full_path): + metadata['checksum'] = compute_file_checksum(store, full_path) + metadata['size'] = store.size(full_path) return metadata - def retrieve(self, metadata: dict, store_backend) -> ObjectRef: + def decode(self, stored: dict, *, key=None) -> ObjectRef: """Return ObjectRef for lazy access.""" return ObjectRef( - store=store_backend, - path=metadata['path'], - checksum=metadata.get('checksum') # optional verification + store=get_store(stored['store']), + path=stored['path'], + checksum=stored.get('checksum') # optional verification ) ``` #### Database Column -The `filepath` type uses the `json` core type: - ```sql -- filepath column (MySQL) recording JSON NOT NULL --- Contains: {"path": "experiment_001/data.nwb", "checksum": "...", "size": ...} +-- Contains: {"path": "experiment_001/data.nwb", "store": "main", "checksum": "...", "size": ...} -- filepath column (PostgreSQL) recording JSONB NOT NULL @@ -205,49 +244,52 @@ recording JSONB NOT NULL | Paths | 
Relative | Relative (unchanged) | | Store param | Required (`@store`) | Required (`@store`) | +## Database Types + ### `json` - Cross-Database JSON Type -**New core type.** JSON storage compatible across MySQL and PostgreSQL: +JSON storage compatible across MySQL and PostgreSQL: ```sql -- MySQL column_name JSON NOT NULL --- PostgreSQL +-- PostgreSQL (uses JSONB for better indexing) column_name JSONB NOT NULL ``` -The `json` core type: +The `json` database type: +- Used as dtype by built-in AttributeTypes (`object`, `content`, `filepath@store`) - Stores arbitrary JSON-serializable data - Automatically uses appropriate type for database backend - Supports JSON path queries where available ## Parameterized AttributeTypes -AttributeTypes can be parameterized with `` syntax. The parameter is passed -through to the underlying dtype: +AttributeTypes can be parameterized with `` syntax. The parameter specifies +which store to use: ```python class AttributeType: - type_name: str # Name used in - dtype: str # Base underlying type + type_name: str # Name used in or as bare type + dtype: str # Database type or built-in AttributeType - # When user writes , resolved dtype becomes: - # f"{dtype}@{param}" if param specified, else dtype + # When user writes type_name@param, resolved store becomes param ``` **Resolution examples:** ``` - → dtype = "content" → default store - → dtype = "content@cold" → cold store - → dtype = "longblob" → database - → ERROR: longblob doesn't support parameters + → uses content type → default store + → uses content type → cold store + → dtype = "longblob" → database (no store) +object@cold → uses object type → cold store ``` -This means `` and `` share the same AttributeType class - the -parameter flows through to the core type, which validates whether it supports `@store`. 
+AttributeTypes can use other AttributeTypes as their dtype (composition): +- `` uses `content` - adds djblob serialization on top of content-addressed storage +- `` uses `content` - adds filename preservation on top of content-addressed storage -## AttributeTypes (Built on Core Types) +## User-Defined AttributeTypes ### `` - Internal Serialized Blob @@ -364,31 +406,35 @@ class Attachments(dj.Manual): ``` ┌───────────────────────────────────────────────────────────────────┐ │ AttributeTypes │ -│ │ +│ │ +│ Built-in: object content filepath@s │ +│ User: │ ├───────────────────────────────────────────────────────────────────┤ -│ Core DataJoint Types │ -│ longblob content object filepath@s json │ -│ content@s object@s │ -├───────────────────────────────────────────────────────────────────┤ -│ Database Types │ -│ LONGBLOB CHAR(64) JSON JSON/JSONB VARCHAR etc. │ -│ (MySQL) (PostgreSQL) │ +│ Database Types (dtype) │ +│ │ +│ LONGBLOB JSON/JSONB VARCHAR INT etc. │ └───────────────────────────────────────────────────────────────────┘ ``` +All storage types are AttributeTypes: +- **Built-in**: `object`, `content`, `filepath@store` - auto-registered, use `dj.config` +- **User-defined**: ``, ``, ``, ``, `` - registered via `@dj.register_type` + ## Storage Comparison -| Type | Core Type | Storage Location | Dedup | Returns | -|------|-----------|------------------|-------|---------| +| Type | dtype | Storage Location | Dedup | Returns | +|------|-------|------------------|-------|---------| +| `object` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `object@s` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `content` | `json` | `_content/{hash}` | Yes | bytes | +| `content@s` | `json` | `_content/{hash}` | Yes | bytes | +| `filepath@s` | `json` | Configured store (relative path) | No | ObjectRef | | `` | `longblob` | Database | No | Python object | | `` | `content` | `_content/{hash}` | Yes | Python object | | `` | `content@s` | `_content/{hash}` | Yes | 
Python object | | `` | `longblob` | Database | No | Local file path | | `` | `content` | `_content/{hash}` | Yes | Local file path | | `` | `content@s` | `_content/{hash}` | Yes | Local file path | -| `object` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `object@s` | — | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `filepath@s` | `json` | Configured store (relative path) | No | ObjectRef | ## Reference Counting for Content Type @@ -435,10 +481,11 @@ def garbage_collect(project): (ContentRegistry() & {'content_hash': content_hash}).delete() ``` -## Core Type Comparison +## Built-in AttributeType Comparison | Feature | `object` | `content` | `filepath@store` | |---------|----------|-----------|------------------| +| dtype | `json` | `json` | `json` | | Location | OAS store | OAS store | Configured store | | Addressing | Primary key | Content hash | Relative path | | Path control | DataJoint | DataJoint | User | @@ -456,20 +503,21 @@ def garbage_collect(project): ## Key Design Decisions -1. **Layered architecture**: Core types (`object`, `content`, `filepath@store`, `json`) separate from AttributeTypes -2. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores -3. **Filepath for portability**: `filepath@store` uses relative paths within stores for environment portability -4. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent -5. **Content type**: Single-blob, content-addressed, deduplicated storage -6. **JSON core type**: Cross-database compatible (MySQL JSON, PostgreSQL JSONB) -7. **Parameterized types**: `` passes parameter to underlying dtype -8. **Naming convention**: +1. **Two-layer architecture**: Database types (`json`, `longblob`, etc.) and AttributeTypes +2. **All storage types are AttributeTypes**: Built-in (`object`, `content`, `filepath@store`) and user-defined (``, etc.) +3. **Built-in types use JSON dtype**: Stores metadata (path, hash, store name, etc.) in JSON columns +4. 
**Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores +5. **Filepath for portability**: `filepath@store` uses relative paths within stores for environment portability +6. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent +7. **Content type**: Single-blob, content-addressed, deduplicated storage +8. **Parameterized types**: `` passes parameter to underlying dtype +9. **Naming convention**: - `` = internal serialized (database) - `` = external serialized (content-addressed) - `` = internal file (single file) - `` = external file (single file) -9. **Transparent access**: AttributeTypes return Python objects or file paths -10. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef +10. **Transparent access**: AttributeTypes return Python objects or file paths +11. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef ## Migration from Legacy Types From 2a5d161fe8dccfa5475d89563bcc877c1183ccb1 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 06:32:42 +0000 Subject: [PATCH 21/42] Add three-layer type architecture with core DataJoint types Layer 1: Native database types (FLOAT, TINYINT, etc.) - backend-specific, discouraged Layer 2: Core DataJoint types (float32, uint8, bool, json) - standardized, scientist-friendly Layer 3: AttributeTypes (object, content, , etc.) 
- encode/decode, composable Core types provide: - Consistent interface across MySQL and PostgreSQL - Scientist-friendly names (float32 vs FLOAT, uint8 vs TINYINT UNSIGNED) - Automatic backend translation Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 146 +++++++++++++------ 1 file changed, 103 insertions(+), 43 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 32083a88e..0d4223a96 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -2,14 +2,31 @@ ## Overview -This document defines a two-layer storage architecture: +This document defines a three-layer type architecture: -1. **Database types**: `longblob`, `varchar`, `int`, `json`, etc. (MySQL/PostgreSQL native) -2. **AttributeTypes**: Custom types with `encode()`/`decode()` semantics +1. **Native database types** - Backend-specific (`FLOAT`, `TINYINT UNSIGNED`, `LONGBLOB`). Discouraged for direct use. +2. **Core DataJoint types** - Standardized across backends, scientist-friendly (`float32`, `uint8`, `bool`, `json`). +3. **AttributeTypes** - Programmatic types with `encode()`/`decode()` semantics. Composable. -All DataJoint storage types (`object`, `content`, `filepath@store`, ``, etc.) are -implemented as **AttributeTypes**. Some are built-in (auto-registered, use `dj.config` for stores) -while others are user-defined. +``` +┌───────────────────────────────────────────────────────────────────┐ +│ AttributeTypes (Layer 3) │ +│ │ +│ Built-in: object content filepath@s │ +│ User: ... 
│ +├───────────────────────────────────────────────────────────────────┤ +│ Core DataJoint Types (Layer 2) │ +│ │ +│ int8 int16 int32 int64 float32 float64 bool decimal │ +│ uint8 uint16 uint32 uint64 varchar char uuid date │ +│ json longblob blob timestamp datetime enum │ +├───────────────────────────────────────────────────────────────────┤ +│ Native Database Types (Layer 1) │ +│ │ +│ MySQL: TINYINT SMALLINT INT BIGINT FLOAT DOUBLE ... │ +│ PostgreSQL: SMALLINT INTEGER BIGINT REAL DOUBLE PRECISION │ +└───────────────────────────────────────────────────────────────────┘ +``` ### OAS Storage Regions @@ -23,10 +40,68 @@ while others are user-defined. `filepath@store` provides portable relative paths within configured stores with lazy ObjectRef access. For arbitrary URLs that don't need ObjectRef semantics, use `varchar` instead. -## Built-in AttributeTypes +## Core DataJoint Types (Layer 2) + +Core types provide a standardized, scientist-friendly interface that works identically across +MySQL and PostgreSQL backends. Users should prefer these over native database types. 
+ +### Numeric Types + +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `int8` | 8-bit signed | `TINYINT` | `SMALLINT` (clamped) | +| `int16` | 16-bit signed | `SMALLINT` | `SMALLINT` | +| `int32` | 32-bit signed | `INT` | `INTEGER` | +| `int64` | 64-bit signed | `BIGINT` | `BIGINT` | +| `uint8` | 8-bit unsigned | `TINYINT UNSIGNED` | `SMALLINT` (checked) | +| `uint16` | 16-bit unsigned | `SMALLINT UNSIGNED` | `INTEGER` (checked) | +| `uint32` | 32-bit unsigned | `INT UNSIGNED` | `BIGINT` (checked) | +| `uint64` | 64-bit unsigned | `BIGINT UNSIGNED` | `NUMERIC(20)` | +| `float32` | 32-bit float | `FLOAT` | `REAL` | +| `float64` | 64-bit float | `DOUBLE` | `DOUBLE PRECISION` | +| `decimal(p,s)` | Fixed precision | `DECIMAL(p,s)` | `NUMERIC(p,s)` | + +### String Types + +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `char(n)` | Fixed-length | `CHAR(n)` | `CHAR(n)` | +| `varchar(n)` | Variable-length | `VARCHAR(n)` | `VARCHAR(n)` | + +### Boolean + +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `bool` | True/False | `TINYINT(1)` | `BOOLEAN` | + +### Date/Time Types -Built-in types are auto-registered and use `dj.config['stores']` for store configuration. -They use `json` as their database dtype to store metadata. 
+| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `date` | Date only | `DATE` | `DATE` | +| `datetime` | Date and time | `DATETIME(6)` | `TIMESTAMP` | +| `timestamp` | Auto-updating | `TIMESTAMP` | `TIMESTAMP` | +| `time` | Time only | `TIME` | `TIME` | + +### Binary Types + +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `blob` | Binary up to 64KB | `BLOB` | `BYTEA` | +| `longblob` | Binary up to 4GB | `LONGBLOB` | `BYTEA` | + +### Special Types + +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `json` | JSON document | `JSON` | `JSONB` | +| `uuid` | UUID | `CHAR(36)` | `UUID` | +| `enum(...)` | Enumeration | `ENUM(...)` | `VARCHAR` + CHECK | + +## AttributeTypes (Layer 3) + +AttributeTypes provide `encode()`/`decode()` semantics on top of core types. They are +composable and can be built-in or user-defined. ### `object` / `object@store` - Path-Addressed Storage @@ -401,25 +476,6 @@ class Attachments(dj.Manual): """ ``` -## Type Layering Summary - -``` -┌───────────────────────────────────────────────────────────────────┐ -│ AttributeTypes │ -│ │ -│ Built-in: object content filepath@s │ -│ User: │ -├───────────────────────────────────────────────────────────────────┤ -│ Database Types (dtype) │ -│ │ -│ LONGBLOB JSON/JSONB VARCHAR INT etc. │ -└───────────────────────────────────────────────────────────────────┘ -``` - -All storage types are AttributeTypes: -- **Built-in**: `object`, `content`, `filepath@store` - auto-registered, use `dj.config` -- **User-defined**: ``, ``, ``, ``, `` - registered via `@dj.register_type` - ## Storage Comparison | Type | dtype | Storage Location | Dedup | Returns | @@ -503,21 +559,25 @@ def garbage_collect(project): ## Key Design Decisions -1. **Two-layer architecture**: Database types (`json`, `longblob`, etc.) and AttributeTypes -2. 
**All storage types are AttributeTypes**: Built-in (`object`, `content`, `filepath@store`) and user-defined (``, etc.) -3. **Built-in types use JSON dtype**: Stores metadata (path, hash, store name, etc.) in JSON columns -4. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores -5. **Filepath for portability**: `filepath@store` uses relative paths within stores for environment portability -6. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent -7. **Content type**: Single-blob, content-addressed, deduplicated storage -8. **Parameterized types**: `` passes parameter to underlying dtype -9. **Naming convention**: - - `` = internal serialized (database) - - `` = external serialized (content-addressed) - - `` = internal file (single file) - - `` = external file (single file) -10. **Transparent access**: AttributeTypes return Python objects or file paths -11. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef +1. **Three-layer architecture**: + - Layer 1: Native database types (backend-specific, discouraged) + - Layer 2: Core DataJoint types (standardized, scientist-friendly) + - Layer 3: AttributeTypes (encode/decode, composable) +2. **Core types are scientist-friendly**: `float32`, `uint8`, `bool` instead of `FLOAT`, `TINYINT UNSIGNED`, `TINYINT(1)` +3. **AttributeTypes are composable**: `` uses `content`, which uses `json` +4. **Built-in AttributeTypes use JSON dtype**: Stores metadata (path, hash, store name, etc.) +5. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores +6. **Filepath for portability**: `filepath@store` uses relative paths within stores for environment portability +7. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent +8. **Content type**: Single-blob, content-addressed, deduplicated storage +9. **Parameterized types**: `type@param` passes store parameter +10. 
**Naming convention**: + - `` = internal serialized (database) + - `` = external serialized (content-addressed) + - `` = internal file (single file) + - `` = external file (single file) +11. **Transparent access**: AttributeTypes return Python objects or file paths +12. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef ## Migration from Legacy Types From d36739dac2a1e8e95dabbe4420c7c5bd332200ed Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 06:37:03 +0000 Subject: [PATCH 22/42] Use angle brackets for all AttributeTypes in definitions All AttributeTypes (Layer 3) now use angle bracket syntax in table definitions: - Core types (Layer 2): int32, float64, varchar(255) - no brackets - AttributeTypes (Layer 3): , , - angle brackets This clear visual distinction helps users immediately identify: - Core types: direct database mapping - AttributeTypes: encode/decode transformation Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 99 ++++++++++---------- 1 file changed, 52 insertions(+), 47 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 0d4223a96..3d70c908e 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -12,8 +12,8 @@ This document defines a three-layer type architecture: ┌───────────────────────────────────────────────────────────────────┐ │ AttributeTypes (Layer 3) │ │ │ -│ Built-in: object content filepath@s │ -│ User: ... │ +│ Built-in: │ +│ User: ... 
│ ├───────────────────────────────────────────────────────────────────┤ │ Core DataJoint Types (Layer 2) │ │ │ @@ -28,6 +28,10 @@ This document defines a three-layer type architecture: └───────────────────────────────────────────────────────────────────┘ ``` +**Syntax distinction:** +- Core types: `int32`, `float64`, `varchar(255)` - no brackets +- AttributeTypes: ``, ``, `` - angle brackets + ### OAS Storage Regions | Region | Path Pattern | Addressing | Use Case | @@ -37,7 +41,7 @@ This document defines a three-layer type architecture: ### External References -`filepath@store` provides portable relative paths within configured stores with lazy ObjectRef access. +`` provides portable relative paths within configured stores with lazy ObjectRef access. For arbitrary URLs that don't need ObjectRef semantics, use `varchar` instead. ## Core DataJoint Types (Layer 2) @@ -103,7 +107,7 @@ MySQL and PostgreSQL backends. Users should prefer these over native database ty AttributeTypes provide `encode()`/`decode()` semantics on top of core types. They are composable and can be built-in or user-defined. 
-### `object` / `object@store` - Path-Addressed Storage +### `` / `` - Path-Addressed Storage **Built-in AttributeType.** OAS (Object-Augmented Schema) storage: @@ -119,8 +123,8 @@ class Analysis(dj.Computed): definition = """ -> Recording --- - results : object # default store - archive : object@cold # specific store + results : # default store + archive : # specific store """ ``` @@ -149,7 +153,7 @@ class ObjectType(AttributeType): ) ``` -### `content` / `content@store` - Content-Addressed Storage +### `` / `` - Content-Addressed Storage **Built-in AttributeType.** Content-addressed storage with deduplication: @@ -208,7 +212,7 @@ class ContentType(AttributeType): #### Database Column -The `content` type stores JSON metadata: +The `` type stores JSON metadata: ```sql -- content column (MySQL) @@ -219,7 +223,7 @@ features JSON NOT NULL features JSONB NOT NULL ``` -### `filepath@store` - Portable External Reference +### `` - Portable External Reference **Built-in AttributeType.** Relative path references within configured stores: @@ -236,9 +240,9 @@ updating data. 
```python class RawData(dj.Manual): definition = """ - session_id : int + session_id : int32 --- - recording : filepath@main # relative path within 'main' store + recording : # relative path within 'main' store """ # Insert - user provides relative path within the store @@ -254,13 +258,13 @@ ref.download('/local/path') # explicit download ref.open() # fsspec streaming access ``` -#### When to Use `filepath@store` vs `varchar` +#### When to Use `` vs `varchar` | Use Case | Recommended Type | |----------|------------------| -| Need ObjectRef/lazy access | `filepath@store` | -| Need portability (relative paths) | `filepath@store` | -| Want checksum verification | `filepath@store` | +| Need ObjectRef/lazy access | `` | +| Need portability (relative paths) | `` | +| Want checksum verification | `` | | Just storing a URL string | `varchar` | | External URLs you don't control | `varchar` | @@ -309,7 +313,7 @@ recording JSON NOT NULL recording JSONB NOT NULL ``` -#### Key Differences from Legacy `filepath@store` +#### Key Differences from Legacy `filepath@store` (now ``) | Feature | Legacy | New | |---------|--------|-----| @@ -334,7 +338,7 @@ column_name JSONB NOT NULL ``` The `json` database type: -- Used as dtype by built-in AttributeTypes (`object`, `content`, `filepath@store`) +- Used as dtype by built-in AttributeTypes (``, ``, ``) - Stores arbitrary JSON-serializable data - Automatically uses appropriate type for database backend - Supports JSON path queries where available @@ -354,15 +358,15 @@ class AttributeType: **Resolution examples:** ``` - → uses content type → default store - → uses content type → cold store - → dtype = "longblob" → database (no store) -object@cold → uses object type → cold store + → uses type → default store + → uses type → cold store + → dtype = "longblob" → database (no store) + → uses type → cold store ``` AttributeTypes can use other AttributeTypes as their dtype (composition): -- `` uses `content` - adds djblob serialization on top of 
content-addressed storage -- `` uses `content` - adds filename preservation on top of content-addressed storage +- `` uses `` - adds djblob serialization on top of content-addressed storage +- `` uses `` - adds filename preservation on top of content-addressed storage ## User-Defined AttributeTypes @@ -480,17 +484,17 @@ class Attachments(dj.Manual): | Type | dtype | Storage Location | Dedup | Returns | |------|-------|------------------|-------|---------| -| `object` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `object@s` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `content` | `json` | `_content/{hash}` | Yes | bytes | -| `content@s` | `json` | `_content/{hash}` | Yes | bytes | -| `filepath@s` | `json` | Configured store (relative path) | No | ObjectRef | +| `` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `` | `json` | `_content/{hash}` | Yes | bytes | +| `` | `json` | `_content/{hash}` | Yes | bytes | +| `` | `json` | Configured store (relative path) | No | ObjectRef | | `` | `longblob` | Database | No | Python object | -| `` | `content` | `_content/{hash}` | Yes | Python object | -| `` | `content@s` | `_content/{hash}` | Yes | Python object | +| `` | `` | `_content/{hash}` | Yes | Python object | +| `` | `` | `_content/{hash}` | Yes | Python object | | `` | `longblob` | Database | No | Local file path | -| `` | `content` | `_content/{hash}` | Yes | Local file path | -| `` | `content@s` | `_content/{hash}` | Yes | Local file path | +| `` | `` | `_content/{hash}` | Yes | Local file path | +| `` | `` | `_content/{hash}` | Yes | Local file path | ## Reference Counting for Content Type @@ -539,8 +543,8 @@ def garbage_collect(project): ## Built-in AttributeType Comparison -| Feature | `object` | `content` | `filepath@store` | -|---------|----------|-----------|------------------| +| Feature | `` | `` | `` | +|---------|------------|-------------|---------------------| | 
dtype | `json` | `json` | `json` | | Location | OAS store | OAS store | Configured store | | Addressing | Primary key | Content hash | Relative path | @@ -552,9 +556,9 @@ def garbage_collect(project): | Integrity | DataJoint managed | DataJoint managed | User managed | **When to use each:** -- **`object`**: Large/complex objects where DataJoint controls organization (Zarr, HDF5) -- **`content`**: Deduplicated serialized data or file attachments via ``, `` -- **`filepath@store`**: Portable references to files in configured stores +- **``**: Large/complex objects where DataJoint controls organization (Zarr, HDF5) +- **``**: Deduplicated serialized data or file attachments via ``, `` +- **``**: Portable references to files in configured stores - **`varchar`**: Arbitrary URLs/paths where ObjectRef semantics aren't needed ## Key Design Decisions @@ -564,20 +568,21 @@ def garbage_collect(project): - Layer 2: Core DataJoint types (standardized, scientist-friendly) - Layer 3: AttributeTypes (encode/decode, composable) 2. **Core types are scientist-friendly**: `float32`, `uint8`, `bool` instead of `FLOAT`, `TINYINT UNSIGNED`, `TINYINT(1)` -3. **AttributeTypes are composable**: `` uses `content`, which uses `json` -4. **Built-in AttributeTypes use JSON dtype**: Stores metadata (path, hash, store name, etc.) -5. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores -6. **Filepath for portability**: `filepath@store` uses relative paths within stores for environment portability -7. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent -8. **Content type**: Single-blob, content-addressed, deduplicated storage -9. **Parameterized types**: `type@param` passes store parameter -10. **Naming convention**: +3. **AttributeTypes use angle brackets**: ``, ``, `` - distinguishes from core types +4. **AttributeTypes are composable**: `` uses ``, which uses `json` +5. 
**Built-in AttributeTypes use JSON dtype**: Stores metadata (path, hash, store name, etc.) +6. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores +7. **Filepath for portability**: `` uses relative paths within stores for environment portability +8. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent +9. **Content type**: Single-blob, content-addressed, deduplicated storage +10. **Parameterized types**: `` passes store parameter +11. **Naming convention**: - `` = internal serialized (database) - `` = external serialized (content-addressed) - `` = internal file (single file) - `` = external file (single file) -11. **Transparent access**: AttributeTypes return Python objects or file paths -12. **Lazy access**: `object`, `object@store`, and `filepath@store` return ObjectRef +12. **Transparent access**: AttributeTypes return Python objects or file paths +13. **Lazy access**: ``, ``, and `` return ObjectRef ## Migration from Legacy Types From 5c1e854e64497a2d1b37b56c64e9402e82b755c8 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 06:46:28 +0000 Subject: [PATCH 23/42] Add implementation plan for storage types redesign Seven-phase implementation plan covering: - Phase 1: Core type system foundation (type mappings, store parameters) - Phase 2: Content-addressed storage ( type, ContentRegistry) - Phase 3: User-defined AttributeTypes (, , , ) - Phase 4: Insert and fetch integration (type composition) - Phase 5: Garbage collection (project-wide GC scanner) - Phase 6: Migration utilities (legacy external stores) - Phase 7: Documentation and testing Estimated effort: 24-32 days across all phases Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 570 ++++++++++++++++++ 1 file changed, 570 insertions(+) create mode 100644 docs/src/design/tables/storage-types-implementation-plan.md diff --git a/docs/src/design/tables/storage-types-implementation-plan.md 
b/docs/src/design/tables/storage-types-implementation-plan.md new file mode 100644 index 000000000..13d2e45d3 --- /dev/null +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -0,0 +1,570 @@ +# DataJoint Storage Types Redesign - Implementation Plan + +## Executive Summary + +This plan describes the implementation of a three-layer type architecture for DataJoint, building on the existing `AttributeType` infrastructure. The key goals are: + +1. Establish a clean three-layer type hierarchy (native DB types, core DataJoint types, AttributeTypes) +2. Implement content-addressed storage with deduplication +3. Provide composable, user-friendly types (`<xblob>`, `<xattach>`, `<filepath@store>`) +4. Enable project-wide garbage collection via `ContentRegistry` +5. Maintain backward compatibility with existing schemas + +--- + +## Phase 1: Core Type System Foundation + +**Goal**: Establish the complete Layer 2 core type mappings and enhance the AttributeType infrastructure. + +### 1.1 Expand Core Type Mappings + +**Files to modify:** +- `src/datajoint/declare.py` + +**Current state**: `SQL_TYPE_ALIASES` already maps some types (float32, int32, etc.) + +**Changes needed**: +1. Complete the type mappings as per spec: + ``` + Core Type -> MySQL Type + int8 -> TINYINT + uint8 -> TINYINT UNSIGNED + int16 -> SMALLINT + ... + json -> JSON + uuid -> BINARY(16) or CHAR(36) + decimal -> DECIMAL(p,s) + ``` + +2. Add PostgreSQL mappings for future support (can be placeholder initially) + +**Dependencies**: None + +### 1.2 Enhance AttributeType with Store Parameter Support + +**Files to modify:** +- `src/datajoint/attribute_type.py` + +**Current state**: Types don't support `@store` parameter syntax + +**Changes needed**: +1. Add `store_name` property to `AttributeType` +2. Modify `resolve_dtype()` to handle `<type@store>` syntax +3.
Add `get_type_with_store(name_with_store)` helper that parses `xblob@cold` format + +```python +def parse_type_spec(spec: str) -> tuple[str, str | None]: + """Parse '<xblob@cold>' or '<xblob>' into (type_name, store_name).""" + spec = spec.strip("<>") + if "@" in spec: + type_name, store_name = spec.split("@", 1) + return type_name, store_name + return spec, None +``` + +**Dependencies**: None + +### 1.3 Update Heading and Declaration Parsing + +**Files to modify:** +- `src/datajoint/heading.py` +- `src/datajoint/declare.py` + +**Changes needed**: +1. Update `TYPE_PATTERN` to recognize new AttributeType patterns +2. Store `store_name` in attribute metadata for parameterized types +3. Update `compile_attribute()` to handle `<type@store>` syntax +4. Update `_init_from_database()` to reconstruct store information + +**Dependencies**: Phase 1.2 + +--- + +## Phase 2: Content-Addressed Storage Implementation + +**Goal**: Implement the `<content>` type with content-addressed storage and deduplication. + +### 2.1 Create ContentRegistry Table + +**New file to create:** +- `src/datajoint/content_registry.py` + +**Implementation**: +```python +class ContentRegistry: + """ + Project-level content registry for content-addressed storage. + Stored in a designated database (e.g., `{project}_content`).
+ """ + definition = """ + # Content-addressed object registry (project-wide) + content_hash : char(64) # SHA256 hex + --- + store : varchar(64) # Store name + size : bigint unsigned # Size in bytes + created : timestamp DEFAULT CURRENT_TIMESTAMP + """ +``` + +Key features: +- Auto-create the registry database on first use +- Methods: `insert_content()`, `get_content()`, `increment_ref()`, `decrement_ref()` +- Thread-safe reference counting (if needed) + +**Dependencies**: None + +### 2.2 Implement ContentType AttributeType + +**Files to modify:** +- `src/datajoint/attribute_type.py` + +**New built-in type**: +```python +class ContentType(AttributeType): + """Built-in AttributeType for content-addressed storage.""" + type_name = "content" + dtype = "json" + + def encode(self, data: bytes, *, key=None, store_name=None) -> dict: + """Store content, return metadata as JSON.""" + content_hash = hashlib.sha256(data).hexdigest() + path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + # Store if not exists, register in ContentRegistry + ... + return {"hash": content_hash, "store": store_name, "size": len(data)} + + def decode(self, stored: dict, *, key=None) -> bytes: + """Retrieve content by hash.""" + ... +``` + +**Dependencies**: Phase 2.1 + +### 2.3 Implement Content Storage Backend Methods + +**Files to modify:** +- `src/datajoint/storage.py` + +**Changes needed**: +1. Add `put_content()` method with deduplication +2. Add `get_content()` method with hash verification +3. Add `compute_content_hash()` utility +4. Add content path generation: `_content/{hash[:2]}/{hash[2:4]}/{hash}` + +**Dependencies**: None + +--- + +## Phase 3: User-Defined AttributeTypes + +**Goal**: Implement the standard user-facing types that compose with `` and ``. 
+ +### 3.1 Implement XBlobType (External Blob) + +**Files to modify:** +- `src/datajoint/attribute_type.py` + +```python +@register_type +class XBlobType(AttributeType): + """External serialized blob using content-addressed storage.""" + type_name = "xblob" + dtype = "<content>" # Composition: uses ContentType + + def encode(self, value, *, key=None) -> bytes: + from . import blob + return blob.pack(value, compress=True) + + def decode(self, stored, *, key=None) -> Any: + from . import blob + return blob.unpack(stored) +``` + +**Key behavior**: Serializes to djblob format, stores via content-addressed storage + +**Dependencies**: Phase 2.2 + +### 3.2 Implement AttachType and XAttachType + +**Files to modify:** +- `src/datajoint/attribute_type.py` + +```python +@register_type +class AttachType(AttributeType): + """Internal file attachment stored in database.""" + type_name = "attach" + dtype = "longblob" + + def encode(self, filepath, *, key=None) -> bytes: + path = Path(filepath) + return path.name.encode() + b"\0" + path.read_bytes() + + def decode(self, stored, *, key=None) -> str: + filename, contents = stored.split(b"\0", 1) + # Write to download_path and return path + ... + +@register_type +class XAttachType(AttributeType): + """External file attachment using content-addressed storage.""" + type_name = "xattach" + dtype = "<content>" + + def encode(self, filepath, *, key=None) -> bytes: + path = Path(filepath) + return path.name.encode() + b"\0" + path.read_bytes() + + def decode(self, stored, *, key=None) -> str: + # Same as AttachType.decode() + ...
+``` + +**Dependencies**: Phase 2.2 + +### 3.3 Implement FilepathType + +**Files to modify:** +- `src/datajoint/attribute_type.py` + +```python +@register_type +class FilepathType(AttributeType): + """Portable relative path reference within configured stores.""" + type_name = "filepath" + dtype = "json" + + def encode(self, relative_path: str, *, key=None, store_name=None, + compute_checksum: bool = False) -> dict: + """Register reference to file in store.""" + store = get_store(store_name) # Required for filepath + metadata = {'path': relative_path, 'store': store_name} + if compute_checksum: + # Compute checksum and size + ... + return metadata + + def decode(self, stored: dict, *, key=None) -> ObjectRef: + """Return ObjectRef for lazy access.""" + return ObjectRef( + store=get_store(stored['store']), + path=stored['path'], + checksum=stored.get('checksum') + ) +``` + +**Key difference from legacy**: Returns `ObjectRef` instead of copying to local stage + +**Dependencies**: Existing `ObjectRef` and `StorageBackend` + +--- + +## Phase 4: Insert and Fetch Integration + +**Goal**: Update the data path to handle the new type system seamlessly. + +### 4.1 Update Insert Processing + +**Files to modify:** +- `src/datajoint/table.py` + +**Changes needed in `__make_placeholder()`**: +1. Handle type composition (resolve full type chain) +2. Pass `store_name` to `encode()` when applicable +3. Handle `<content>` type's special behavior +4. Process `<filepath@store>` with store parameter + +```python +def __make_placeholder(self, name, value, ...): + attr = self.heading[name] + if attr.adapter: + # Resolve type chain and pass store_name + final_dtype, type_chain = resolve_dtype(attr.adapter.dtype) + store_name = attr.store + + # Apply type chain: outer -> inner + for attr_type in type_chain: + value = attr_type.encode(value, key=key, store_name=store_name) + + # Continue with final_dtype processing + ...
+``` + +**Dependencies**: Phases 1-3 + +### 4.2 Update Fetch Processing + +**Files to modify:** +- `src/datajoint/fetch.py` + +**Changes needed in `_get()`**: +1. Handle `<content>` type: retrieve from content store +2. Handle type composition: apply decoders in reverse order +3. Handle `<filepath@store>`: return `ObjectRef` instead of downloading + +```python +def _get(connection, attr, data, squeeze, download_path): + if attr.adapter: + final_dtype, type_chain = resolve_dtype(attr.adapter.dtype) + + # Process based on final_dtype + if final_dtype == "json": + data = json.loads(data) + elif final_dtype == "longblob": + # Handle content retrieval if needed + ... + + # Apply type chain in reverse: inner -> outer + for attr_type in reversed(type_chain): + data = attr_type.decode(data, key=key) + + return data +``` + +**Dependencies**: Phases 1-3 + +### 4.3 Update Heading Attribute Properties + +**Files to modify:** +- `src/datajoint/heading.py` + +**Changes needed**: +1. Add `is_content` property for content-addressed attributes +2. Update property detection logic for new types +3. Store composed type information for fetch/insert + +**Dependencies**: Phase 1.3 + +--- + +## Phase 5: Garbage Collection + +**Goal**: Implement project-wide garbage collection for content-addressed storage. + +### 5.1 Implement GC Scanner + +**New file to create:** +- `src/datajoint/gc.py` + +```python +def scan_content_references(project) -> set[tuple[str, str]]: + """ + Scan all schemas in project for content references.
+ + Returns: + Set of (content_hash, store) tuples that are referenced + """ + referenced = set() + for schema in project.schemas: + for table in schema.tables: + for attr in table.heading.attributes: + if attr.type in ('content', 'xblob', 'xattach'): + hashes = table.fetch(attr.name) + for h in hashes: + if isinstance(h, dict): + referenced.add((h['hash'], h.get('store'))) + return referenced + +def garbage_collect(project, dry_run=True) -> dict: + """ + Remove unreferenced content from storage. + + Returns: + Stats: {'scanned': N, 'orphaned': M, 'deleted': K, 'bytes_freed': B} + """ + ... +``` + +**Dependencies**: Phase 2.1 + +### 5.2 Add GC CLI Commands + +**Files to modify:** +- CLI or management interface + +**New commands**: +- `dj gc scan` - Scan and report orphaned content +- `dj gc clean` - Remove orphaned content +- `dj gc status` - Show content registry status + +**Dependencies**: Phase 5.1 + +--- + +## Phase 6: Migration Utilities + +**Goal**: Provide tools to migrate existing schemas to the new type system. + +### 6.1 Enhance Migration Module + +**Files to modify:** +- `src/datajoint/migrate.py` + +**New functions**: + +```python +def analyze_external_stores(schema) -> list[dict]: + """Analyze legacy ~external_* tables for migration.""" + ... + +def migrate_external_to_content(schema, store_name, dry_run=True) -> dict: + """ + Migrate legacy ~external_{store} to new ContentRegistry. + + Steps: + 1. Read entries from ~external_{store} + 2. For each entry: fetch content, compute SHA256 + 3. Copy to _content/{hash}/ if not exists + 4. Update referencing tables (UUID -> hash JSON) + 5. Register in ContentRegistry + """ + ... + +def migrate_blob_to_djblob(schema, dry_run=True) -> dict: + """Update implicit blob columns to use <djblob>.""" + ... + +def migrate_filepath_to_new(schema, dry_run=True) -> dict: + """ + Migrate legacy filepath@store to new <filepath@store>. + + Changes: + - UUID column -> JSON column + - Copy-based access -> ObjectRef-based access + """ + ... 
+``` + +### 6.2 Create Migration CLI + +**New commands**: +- `dj migrate analyze <schema>` - Analyze migration needs +- `dj migrate external <schema> <store>` - Migrate external store +- `dj migrate blobs <schema>` - Migrate blob columns +- `dj migrate status <schema>` - Show migration status + +**Dependencies**: Phase 6.1 + +--- + +## Phase 7: Documentation and Testing + +### 7.1 Unit Tests + +**New test files:** +- `tests/test_content_type.py` - Content-addressed storage tests +- `tests/test_xblob.py` - XBlob type tests +- `tests/test_attach_types.py` - Attachment type tests +- `tests/test_filepath_new.py` - New filepath tests +- `tests/test_gc.py` - Garbage collection tests +- `tests/test_migration.py` - Migration utility tests + +**Existing test files to update:** +- `tests/test_attribute_type.py` - Add new type tests +- `tests/test_object.py` - Verify object type unchanged + +### 7.2 Integration Tests + +**Test scenarios**: +1. Insert/fetch roundtrip for all new types +2. Type composition (xblob using content) +3. Multi-schema content deduplication +4. GC with cross-schema references +5. Migration from legacy external stores +6. 
Backward compatibility with existing schemas + +### 7.3 Documentation + +**Files to update:** +- `docs/src/design/tables/storage-types-spec.md` - Already exists +- Create user guide for new types +- Create migration guide +- Update API reference + +--- + +## Implementation Order and Dependencies + +``` +Phase 1: Core Type System Foundation +├── 1.1 Expand Core Type Mappings (no deps) +├── 1.2 Enhance AttributeType with Store Parameter (no deps) +└── 1.3 Update Heading and Declaration Parsing (depends on 1.2) + +Phase 2: Content-Addressed Storage +├── 2.1 Create ContentRegistry Table (no deps) +├── 2.2 Implement ContentType (depends on 2.1) +└── 2.3 Content Storage Backend Methods (no deps) + +Phase 3: User-Defined AttributeTypes (depends on Phase 2) +├── 3.1 Implement XBlobType (depends on 2.2) +├── 3.2 Implement AttachType and XAttachType (depends on 2.2) +└── 3.3 Implement FilepathType (no deps) + +Phase 4: Insert and Fetch Integration (depends on Phases 1-3) +├── 4.1 Update Insert Processing +├── 4.2 Update Fetch Processing +└── 4.3 Update Heading Attribute Properties + +Phase 5: Garbage Collection (depends on Phase 2) +├── 5.1 Implement GC Scanner +└── 5.2 Add GC CLI Commands + +Phase 6: Migration Utilities (depends on Phases 2-4) +├── 6.1 Enhance Migration Module +└── 6.2 Create Migration CLI + +Phase 7: Documentation and Testing (ongoing) +``` + +--- + +## Critical Files Summary + +| File | Changes | +|------|---------| +| `src/datajoint/attribute_type.py` | All new AttributeTypes: `ContentType`, `XBlobType`, `AttachType`, `XAttachType`, `FilepathType` | +| `src/datajoint/declare.py` | Type pattern parsing, SQL generation, `<type@store>` syntax | +| `src/datajoint/heading.py` | Attribute metadata, composed type information | +| `src/datajoint/table.py` | Insert logic with type composition | +| `src/datajoint/fetch.py` | Fetch logic with type chain decoding | +| `src/datajoint/content_registry.py` | **New**: ContentRegistry table and methods | +| `src/datajoint/gc.py` | 
**New**: Garbage collection scanner | +| `src/datajoint/migrate.py` | Migration utilities | + +--- + +## Risk Mitigation + +### Backward Compatibility +1. All existing types (`longblob`, `blob@store`, `attach@store`, `filepath@store`) continue to work +2. Legacy `~external_*` tables remain functional during transition +3. Implicit blob serialization preserved for existing schemas +4. Migration is opt-in and reversible + +### Performance Considerations +1. Content hashing uses SHA256 (fast, widely supported) +2. Deduplication reduces storage costs +3. Lazy ObjectRef prevents unnecessary I/O +4. GC runs on-demand, not automatically + +### Error Handling +1. Content hash verification on fetch (optional) +2. Graceful handling of missing content +3. Transaction safety for multi-table operations +4. Clear error messages for misconfiguration + +--- + +## Estimated Effort + +| Phase | Estimated Days | Risk | +|-------|----------------|------| +| Phase 1 | 3-4 days | Low | +| Phase 2 | 4-5 days | Medium | +| Phase 3 | 3-4 days | Low | +| Phase 4 | 4-5 days | Medium | +| Phase 5 | 2-3 days | Low | +| Phase 6 | 3-4 days | Medium | +| Phase 7 | 5-7 days | Low | +| **Total** | **24-32 days** | | From 979f45bcc606361134ce45a42f4304cc71e4fbea Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 18:58:34 +0000 Subject: [PATCH 24/42] Implement Phase 1: Core type system with store parameter support Phase 1.1 - Core type mappings already complete in declare.py Phase 1.2 - Enhanced AttributeType with store parameter support: - Added parse_type_spec() to parse "" into (type_name, store_name) - Updated get_type() to handle parameterized types - Updated is_type_registered() to ignore store parameters - Updated resolve_dtype() to propagate store through type chains - Returns (final_dtype, type_chain, store_name) tuple - Store from outer type overrides inner type's store Phase 1.3 - Updated heading and declaration parsing: - Updated get_adapter() to return (adapter, store_name) 
tuple - Updated substitute_special_type() to capture store from ADAPTED types - Store parameter is now properly passed through type resolution Co-authored-by: dimitri-yatsenko --- src/datajoint/attribute_adapter.py | 28 ++++----- src/datajoint/attribute_type.py | 92 +++++++++++++++++++++++------- src/datajoint/declare.py | 5 +- 3 files changed, 91 insertions(+), 34 deletions(-) diff --git a/src/datajoint/attribute_adapter.py b/src/datajoint/attribute_adapter.py index 7df566a58..cf00b8e4b 100644 --- a/src/datajoint/attribute_adapter.py +++ b/src/datajoint/attribute_adapter.py @@ -12,7 +12,7 @@ import warnings from typing import Any -from .attribute_type import AttributeType, get_type, is_type_registered +from .attribute_type import AttributeType, get_type, is_type_registered, parse_type_spec from .errors import DataJointError # Pattern to detect blob types for internal pack/unpack @@ -154,7 +154,7 @@ def get(self, value: Any) -> Any: raise NotImplementedError(f"{self.__class__.__name__} must implement get() or migrate to decode()") -def get_adapter(context: dict | None, adapter_name: str) -> AttributeType: +def get_adapter(context: dict | None, adapter_name: str) -> tuple[AttributeType, str | None]: """ Get an attribute type/adapter by name. @@ -165,47 +165,49 @@ def get_adapter(context: dict | None, adapter_name: str) -> AttributeType: Args: context: Schema context dictionary (for legacy adapters). adapter_name: The adapter/type name, with or without angle brackets. + May include store parameter (e.g., ""). Returns: - The AttributeType instance. + Tuple of (AttributeType instance, store_name or None). Raises: DataJointError: If the adapter is not found or invalid. 
""" - adapter_name = adapter_name.lstrip("<").rstrip(">") + # Parse type name and optional store parameter + type_name, store_name = parse_type_spec(adapter_name) # First, check the global type registry (new system) - if is_type_registered(adapter_name): - return get_type(adapter_name) + if is_type_registered(type_name): + return get_type(type_name), store_name # Fall back to context-based lookup (legacy system) if context is None: raise DataJointError( - f"Attribute type <{adapter_name}> is not registered. " "Use @dj.register_type to register custom types." + f"Attribute type <{type_name}> is not registered. " "Use @dj.register_type to register custom types." ) try: - adapter = context[adapter_name] + adapter = context[type_name] except KeyError: raise DataJointError( - f"Attribute type <{adapter_name}> is not defined. " + f"Attribute type <{type_name}> is not defined. " "Register it with @dj.register_type or include it in the schema context." ) # Validate it's an AttributeType (or legacy AttributeAdapter) if not isinstance(adapter, AttributeType): raise DataJointError( - f"Attribute adapter '{adapter_name}' must be an instance of " + f"Attribute adapter '{type_name}' must be an instance of " "datajoint.AttributeType (or legacy datajoint.AttributeAdapter)" ) # For legacy adapters from context, store the name they were looked up by if isinstance(adapter, AttributeAdapter): - adapter._type_name = adapter_name + adapter._type_name = type_name # Validate the dtype/attribute_type dtype = adapter.dtype if not isinstance(dtype, str) or not re.match(r"^\w", dtype): - raise DataJointError(f"Invalid dtype '{dtype}' in attribute type <{adapter_name}>") + raise DataJointError(f"Invalid dtype '{dtype}' in attribute type <{type_name}>") - return adapter + return adapter, store_name diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py index 9be2d2214..97ca54646 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/attribute_type.py @@ -242,6 
+242,32 @@ class GraphType(dj.AttributeType): return cls +def parse_type_spec(spec: str) -> tuple[str, str | None]: + """ + Parse a type specification into type name and optional store parameter. + + Handles formats like: + - "<xblob>" -> ("xblob", None) + - "<xblob@cold>" -> ("xblob", "cold") + - "xblob@cold" -> ("xblob", "cold") + - "xblob" -> ("xblob", None) + + Args: + spec: Type specification string, with or without angle brackets. + + Returns: + Tuple of (type_name, store_name). store_name is None if not specified. + """ + # Strip angle brackets + spec = spec.strip("<>").strip() + + if "@" in spec: + type_name, store_name = spec.split("@", 1) + return type_name.strip(), store_name.strip() + + return spec, None + + def unregister_type(name: str) -> None: """ Remove a type from the registry. @@ -269,6 +295,7 @@ def get_type(name: str) -> AttributeType: Args: name: The type name, with or without angle brackets. + Store parameters (e.g., "<xblob@cold>") are stripped. Returns: The registered AttributeType instance. @@ -276,20 +303,22 @@ def get_type(name: str) -> AttributeType: Raises: DataJointError: If the type is not found. """ - name = name.strip("<>") + # Strip angle brackets and store parameter + type_name, _ = parse_type_spec(name) # Check explicit registry first - if name in _type_registry: - return _type_registry[name] + if type_name in _type_registry: + return _type_registry[type_name] # Lazy-load entry points _load_entry_points() - if name in _type_registry: - return _type_registry[name] + if type_name in _type_registry: + return _type_registry[type_name] raise DataJointError( - f"Unknown attribute type: <{name}>. " f"Ensure the type is registered via @dj.register_type or installed as a package." + f"Unknown attribute type: <{type_name}>. " + f"Ensure the type is registered via @dj.register_type or installed as a package." ) @@ -309,16 +338,16 @@ def is_type_registered(name: str) -> bool: Check if a type name is registered. Args: - name: The type name to check. 
+ name: The type name to check (store parameters are ignored). Returns: True if the type is registered. """ - name = name.strip("<>") - if name in _type_registry: + type_name, _ = parse_type_spec(name) + if type_name in _type_registry: return True _load_entry_points() - return name in _type_registry + return type_name in _type_registry def _load_entry_points() -> None: @@ -368,23 +397,37 @@ def _load_entry_points() -> None: logger.warning(f"Failed to load attribute type '{ep.name}' from {ep.value}: {e}") -def resolve_dtype(dtype: str, seen: set[str] | None = None) -> tuple[str, list[AttributeType]]: +def resolve_dtype( + dtype: str, seen: set[str] | None = None, store_name: str | None = None +) -> tuple[str, list[AttributeType], str | None]: """ Resolve a dtype string, following type chains. If dtype references another custom type (e.g., "<other_type>"), recursively - resolves to find the ultimate storage type. + resolves to find the ultimate storage type. Store parameters are propagated + through the chain. Args: - dtype: The dtype string to resolve. + dtype: The dtype string to resolve (e.g., "<xblob>", "<xblob@cold>", "longblob"). seen: Set of already-seen type names (for cycle detection). + store_name: Store name from outer type specification (propagated inward). Returns: - Tuple of (final_storage_type, list_of_types_in_chain). + Tuple of (final_storage_type, list_of_types_in_chain, resolved_store_name). The chain is ordered from outermost to innermost type. Raises: DataJointError: If a circular type reference is detected. 
+ + Examples: + >>> resolve_dtype("<xblob>") + ("json", [XBlobType, ContentType], None) + + >>> resolve_dtype("<xblob@cold>") + ("json", [XBlobType, ContentType], "cold") + + >>> resolve_dtype("longblob") + ("longblob", [], None) """ if seen is None: seen = set() @@ -393,7 +436,10 @@ def resolve_dtype(dtype: str, seen: set[str] | None = None) -> tuple[str, list[A # Check if dtype is a custom type reference if dtype.startswith("<") and dtype.endswith(">"): - type_name = dtype[1:-1] + type_name, dtype_store = parse_type_spec(dtype) + + # Store from this level overrides inherited store + effective_store = dtype_store if dtype_store is not None else store_name if type_name in seen: raise DataJointError(f"Circular type reference detected: <{type_name}>") @@ -402,13 +448,19 @@ def resolve_dtype(dtype: str, seen: set[str] | None = None) -> tuple[str, list[A attr_type = get_type(type_name) chain.append(attr_type) - # Recursively resolve the inner dtype - inner_dtype, inner_chain = resolve_dtype(attr_type.dtype, seen) + # Recursively resolve the inner dtype, propagating store + inner_dtype, inner_chain, resolved_store = resolve_dtype(attr_type.dtype, seen, effective_store) chain.extend(inner_chain) - return inner_dtype, chain + return inner_dtype, chain, resolved_store + + # Not a custom type - check if it has a store suffix (e.g., "blob@store") + if "@" in dtype: + base_type, dtype_store = dtype.split("@", 1) + effective_store = dtype_store if dtype_store else store_name + return base_type, chain, effective_store - # Not a custom type - return as-is - return dtype, chain + # Plain type - return as-is with propagated store + return dtype, chain, store_name # ============================================================================= diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index 397da108b..8aabddc00 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -489,7 +489,10 @@ def substitute_special_type(match, category, foreign_key_sql, context): "ON 
UPDATE RESTRICT ON DELETE RESTRICT".format(external_table_root=EXTERNAL_TABLE_ROOT, **match) ) elif category == "ADAPTED": - attr_type = get_adapter(context, match["type"]) + attr_type, store_name = get_adapter(context, match["type"]) + # Store the store parameter if present + if store_name is not None: + match["store"] = store_name match["type"] = attr_type.dtype category = match_type(match["type"]) if category in SPECIAL_TYPES: From 6926c5871a9be749b8bcf06eb2616c34d084844d Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 20:08:48 +0000 Subject: [PATCH 25/42] Remove legacy AttributeAdapter support, update tests for AttributeType - Remove AttributeAdapter class and context-based lookup from attribute_adapter.py - Simplify attribute_adapter.py to compatibility shim that re-exports from attribute_type - Remove AttributeAdapter from package exports in __init__.py - Update tests/schema_adapted.py to use @dj.register_type decorator - Update tests/test_adapted_attributes.py to work with globally registered types - Remove test_attribute_adapter_deprecated test from test_attribute_type.py Types are now registered globally via @dj.register_type decorator, eliminating the need for context-based adapter lookup. Co-authored-by: dimitri-yatsenko --- src/datajoint/__init__.py | 2 - src/datajoint/attribute_adapter.py | 203 +++-------------------------- tests/schema_adapted.py | 42 +++--- tests/test_adapted_attributes.py | 29 +---- tests/test_attribute_type.py | 7 - 5 files changed, 44 insertions(+), 239 deletions(-) diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index 405134630..a19aae6d0 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -48,7 +48,6 @@ "AttributeType", "register_type", "list_types", - "AttributeAdapter", # Deprecated, use AttributeType "errors", "migrate", "DataJointError", @@ -62,7 +61,6 @@ from . import errors from . 
import migrate from .admin import kill -from .attribute_adapter import AttributeAdapter from .attribute_type import AttributeType, list_types, register_type from .blob import MatCell, MatStruct from .cli import cli diff --git a/src/datajoint/attribute_adapter.py b/src/datajoint/attribute_adapter.py index cf00b8e4b..c92618f9e 100644 --- a/src/datajoint/attribute_adapter.py +++ b/src/datajoint/attribute_adapter.py @@ -1,213 +1,42 @@ """ -Legacy attribute adapter module. +Attribute adapter module - compatibility shim. -This module provides backward compatibility for the deprecated AttributeAdapter class. -New code should use :class:`datajoint.AttributeType` instead. +This module re-exports functions from attribute_type for backward compatibility +with code that imports from attribute_adapter. .. deprecated:: 0.15 - Use :class:`datajoint.AttributeType` with ``encode``/``decode`` methods. + Import directly from :mod:`datajoint.attribute_type` instead. """ -import re -import warnings -from typing import Any - -from .attribute_type import AttributeType, get_type, is_type_registered, parse_type_spec +from .attribute_type import ( + AttributeType, + get_type, + is_type_registered, + parse_type_spec, +) from .errors import DataJointError -# Pattern to detect blob types for internal pack/unpack -_BLOB_PATTERN = re.compile(r"^(tiny|small|medium|long|)blob", re.I) - - -class AttributeAdapter(AttributeType): - """ - Legacy base class for attribute adapters. - - .. deprecated:: 0.15 - Use :class:`datajoint.AttributeType` with ``encode``/``decode`` methods instead. - - This class provides backward compatibility for existing adapters that use - the ``attribute_type``, ``put()``, and ``get()`` API. 
- - Migration guide:: - - # Old style (deprecated): - class GraphAdapter(dj.AttributeAdapter): - attribute_type = "longblob" - - def put(self, graph): - return list(graph.edges) - - def get(self, edges): - return nx.Graph(edges) - - # New style (recommended): - @dj.register_type - class GraphType(dj.AttributeType): - type_name = "graph" - dtype = "longblob" - - def encode(self, graph, *, key=None): - return list(graph.edges) - - def decode(self, edges, *, key=None): - return nx.Graph(edges) - """ - - # Subclasses can set this as a class attribute instead of property - attribute_type: str = None # type: ignore - - def __init__(self): - # Emit deprecation warning on instantiation - warnings.warn( - f"{self.__class__.__name__} uses the deprecated AttributeAdapter API. " - "Migrate to AttributeType with encode/decode methods.", - DeprecationWarning, - stacklevel=2, - ) - - @property - def type_name(self) -> str: - """ - Infer type name from class name for legacy adapters. - - Legacy adapters were identified by their variable name in the context dict, - not by a property. For backward compatibility, we use the lowercase class name. - """ - # Check if a _type_name was explicitly set (for context-based lookup) - if hasattr(self, "_type_name"): - return self._type_name - # Fall back to class name - return self.__class__.__name__.lower() - - @property - def dtype(self) -> str: - """Map legacy attribute_type to new dtype property.""" - attr_type = self.attribute_type - if attr_type is None: - raise NotImplementedError( - f"{self.__class__.__name__} must define 'attribute_type' " "(or migrate to AttributeType with 'dtype')" - ) - return attr_type - - def _is_blob_dtype(self) -> bool: - """Check if dtype is a blob type requiring pack/unpack.""" - return bool(_BLOB_PATTERN.match(self.dtype)) - - def encode(self, value: Any, *, key: dict | None = None) -> Any: - """ - Delegate to legacy put() method, with blob packing if needed. 
- - Legacy adapters expect blob.pack to be called after put() when - the dtype is a blob type. This wrapper handles that automatically. - """ - result = self.put(value) - # Legacy adapters expect blob.pack after put() for blob dtypes - if self._is_blob_dtype(): - from . import blob - - result = blob.pack(result) - return result - - def decode(self, stored: Any, *, key: dict | None = None) -> Any: - """ - Delegate to legacy get() method, with blob unpacking if needed. - - Legacy adapters expect blob.unpack to be called before get() when - the dtype is a blob type. This wrapper handles that automatically. - """ - # Legacy adapters expect blob.unpack before get() for blob dtypes - if self._is_blob_dtype(): - from . import blob - - stored = blob.unpack(stored) - return self.get(stored) - - def put(self, obj: Any) -> Any: - """ - Convert an object of the adapted type into a storable value. - - .. deprecated:: 0.15 - Override ``encode()`` instead. - - Args: - obj: An object of the adapted type. - - Returns: - Value to store in the database. - """ - raise NotImplementedError(f"{self.__class__.__name__} must implement put() or migrate to encode()") - - def get(self, value: Any) -> Any: - """ - Convert a value from the database into the adapted type. - - .. deprecated:: 0.15 - Override ``decode()`` instead. - - Args: - value: Value from the database. - - Returns: - Object of the adapted type. - """ - raise NotImplementedError(f"{self.__class__.__name__} must implement get() or migrate to decode()") - def get_adapter(context: dict | None, adapter_name: str) -> tuple[AttributeType, str | None]: """ - Get an attribute type/adapter by name. - - This function provides backward compatibility by checking both: - 1. The global type registry (new system) - 2. The schema context dict (legacy system) + Get an attribute type by name. Args: - context: Schema context dictionary (for legacy adapters). - adapter_name: The adapter/type name, with or without angle brackets. 
+ context: Ignored (legacy parameter, kept for API compatibility). + adapter_name: The type name, with or without angle brackets. May include store parameter (e.g., "<xblob@cold>"). Returns: Tuple of (AttributeType instance, store_name or None). Raises: - DataJointError: If the adapter is not found or invalid. + DataJointError: If the type is not found. """ # Parse type name and optional store parameter type_name, store_name = parse_type_spec(adapter_name) - # First, check the global type registry (new system) + # Look up in the global type registry if is_type_registered(type_name): return get_type(type_name), store_name - # Fall back to context-based lookup (legacy system) - if context is None: - raise DataJointError( - f"Attribute type <{type_name}> is not registered. " "Use @dj.register_type to register custom types." - ) - - try: - adapter = context[type_name] - except KeyError: - raise DataJointError( - f"Attribute type <{type_name}> is not defined. " - "Register it with @dj.register_type or include it in the schema context." - ) - - # Validate it's an AttributeType (or legacy AttributeAdapter) - if not isinstance(adapter, AttributeType): - raise DataJointError( - f"Attribute adapter '{type_name}' must be an instance of " - "datajoint.AttributeType (or legacy datajoint.AttributeAdapter)" - ) - - # For legacy adapters from context, store the name they were looked up by - if isinstance(adapter, AttributeAdapter): - adapter._type_name = type_name - - # Validate the dtype/attribute_type - dtype = adapter.dtype - if not isinstance(dtype, str) or not re.match(r"^\w", dtype): - raise DataJointError(f"Invalid dtype '{dtype}' in attribute type <{type_name}>") - - return adapter, store_name + raise DataJointError(f"Attribute type <{type_name}> is not registered. 
" "Use @dj.register_type to register custom types.") diff --git a/tests/schema_adapted.py b/tests/schema_adapted.py index c7b5830c0..321edfc7b 100644 --- a/tests/schema_adapted.py +++ b/tests/schema_adapted.py @@ -7,40 +7,42 @@ import datajoint as dj -class GraphAdapter(dj.AttributeAdapter): - attribute_type = "longblob" # this is how the attribute will be declared +@dj.register_type +class GraphType(dj.AttributeType): + """Custom type for storing NetworkX graphs as edge lists.""" - @staticmethod - def get(obj): - # convert edge list into a graph - return nx.Graph(obj) + type_name = "graph" + dtype = "longblob" - @staticmethod - def put(obj): - # convert graph object into an edge list + def encode(self, obj, *, key=None): + """Convert graph object into an edge list.""" assert isinstance(obj, nx.Graph) return list(obj.edges) + def decode(self, stored, *, key=None): + """Convert edge list into a graph.""" + return nx.Graph(stored) -class LayoutToFilepath(dj.AttributeAdapter): - """ - An adapted data type that saves a graph layout into fixed filepath - """ - attribute_type = "filepath@repo-s3" +@dj.register_type +class LayoutToFilepathType(dj.AttributeType): + """Custom type that saves a graph layout to a filepath.""" - @staticmethod - def get(path): - with open(path, "r") as f: - return json.load(f) + type_name = "layout_to_filepath" + dtype = "filepath@repo-s3" - @staticmethod - def put(layout): + def encode(self, layout, *, key=None): + """Save layout to file and return path.""" path = Path(dj.config["stores"]["repo-s3"]["stage"], "layout.json") with open(str(path), "w") as f: json.dump(layout, f) return path + def decode(self, path, *, key=None): + """Load layout from file.""" + with open(path, "r") as f: + return json.load(f) + class Connectivity(dj.Manual): definition = """ diff --git a/tests/test_adapted_attributes.py b/tests/test_adapted_attributes.py index 0b4285ffb..eb5cd760d 100644 --- a/tests/test_adapted_attributes.py +++ 
b/tests/test_adapted_attributes.py @@ -1,10 +1,9 @@ """ Tests for adapted/custom attribute types. -These tests use the legacy AttributeAdapter API for backward compatibility testing. +These tests verify the AttributeType system for custom data types. """ -import warnings from itertools import zip_longest import networkx as nx @@ -15,40 +14,23 @@ from . import schema_adapted from .schema_adapted import Connectivity, Layout -# Filter deprecation warnings from legacy AttributeAdapter usage in these tests -pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") - @pytest.fixture def schema_name(prefix): return prefix + "_test_custom_datatype" -@pytest.fixture -def adapted_graph_instance(): - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - yield schema_adapted.GraphAdapter() - - @pytest.fixture def schema_ad( connection_test, - adapted_graph_instance, enable_filepath_feature, s3_creds, tmpdir, schema_name, ): dj.config["stores"] = {"repo-s3": dict(s3_creds, protocol="s3", location="adapted/repo", stage=str(tmpdir))} - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - layout_adapter = schema_adapted.LayoutToFilepath() - context = { - **schema_adapted.LOCALS_ADAPTED, - "graph": adapted_graph_instance, - "layout_to_filepath": layout_adapter, - } + # Types are registered globally via @dj.register_type decorator in schema_adapted + context = {**schema_adapted.LOCALS_ADAPTED} schema = dj.schema(schema_name, context=context, connection=connection_test) schema(schema_adapted.Connectivity) schema(schema_adapted.Layout) @@ -66,9 +48,10 @@ def local_schema(schema_ad, schema_name): @pytest.fixture -def schema_virtual_module(schema_ad, adapted_graph_instance, schema_name): +def schema_virtual_module(schema_ad, schema_name): """Fixture for testing virtual modules""" - schema_virtual_module = dj.VirtualModule("virtual_module", schema_name, add_objects={"graph": adapted_graph_instance}) + # 
Types are registered globally, no need to add_objects for adapters + schema_virtual_module = dj.VirtualModule("virtual_module", schema_name) return schema_virtual_module diff --git a/tests/test_attribute_type.py b/tests/test_attribute_type.py index f8f822a60..e9220bfd4 100644 --- a/tests/test_attribute_type.py +++ b/tests/test_attribute_type.py @@ -340,12 +340,6 @@ def test_exports_from_datajoint(self): assert hasattr(dj, "register_type") assert hasattr(dj, "list_types") - def test_attribute_adapter_deprecated(self): - """Test that AttributeAdapter is still available but deprecated.""" - assert hasattr(dj, "AttributeAdapter") - # AttributeAdapter should be a subclass of AttributeType - assert issubclass(dj.AttributeAdapter, dj.AttributeType) - class TestDJBlobType: """Tests for the built-in DJBlobType.""" @@ -405,7 +399,6 @@ def test_djblob_handles_serialization(self): With the new design: - Plain longblob columns store/return raw bytes (no serialization) - handles pack/unpack in encode/decode - - Legacy AttributeAdapter handles pack/unpack internally for backward compat """ blob_type = get_type("djblob") From 97bc16260cf701a90ebc4df1e26f175ca8d58ed4 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 20:29:02 +0000 Subject: [PATCH 26/42] Simplify core type system: remove SERIALIZED_TYPES, clarify blob semantics Core types (uuid, json, blob) now map directly to native database types without any implicit serialization. Serialization is handled by AttributeTypes like via encode()/decode() methods. 
Changes: - Rename SERIALIZED_TYPES to BINARY_TYPES in declare.py (clearer naming) - Update check for default values in compile_attribute() - Clarify in spec that core blob types store raw bytes Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 7 +++++-- src/datajoint/declare.py | 11 ++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 3d70c908e..a962ee6c8 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -89,10 +89,13 @@ MySQL and PostgreSQL backends. Users should prefer these over native database ty ### Binary Types +Core binary types store raw bytes without any serialization. Use `` AttributeType +for serialized Python objects. + | Core Type | Description | MySQL | PostgreSQL | |-----------|-------------|-------|------------| -| `blob` | Binary up to 64KB | `BLOB` | `BYTEA` | -| `longblob` | Binary up to 4GB | `LONGBLOB` | `BYTEA` | +| `blob` | Raw bytes up to 64KB | `BLOB` | `BYTEA` | +| `longblob` | Raw bytes up to 4GB | `LONGBLOB` | `BYTEA` | ### Special Types diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index 8aabddc00..df89dede2 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -87,14 +87,15 @@ "EXTERNAL_BLOB", "FILEPATH", } # data referenced by a UUID in external tables -SERIALIZED_TYPES = { +# Blob and attachment types cannot have SQL default values (other than NULL) +BINARY_TYPES = { "EXTERNAL_ATTACH", "INTERNAL_ATTACH", "EXTERNAL_BLOB", "INTERNAL_BLOB", -} # requires packing data +} -assert set().union(SPECIAL_TYPES, EXTERNAL_TYPES, SERIALIZED_TYPES) <= set(TYPE_PATTERN) +assert set().union(SPECIAL_TYPES, EXTERNAL_TYPES, BINARY_TYPES) <= set(TYPE_PATTERN) def match_type(attribute_type): @@ -549,12 +550,12 @@ def compile_attribute(line, in_key, foreign_key_sql, context): match["comment"] = 
":{type}:{comment}".format(**match) # insert custom type into comment substitute_special_type(match, category, foreign_key_sql, context) - if category in SERIALIZED_TYPES and match["default"] not in { + if category in BINARY_TYPES and match["default"] not in { "DEFAULT NULL", "NOT NULL", }: raise DataJointError( - "The default value for a blob or attachment attributes can only be NULL in:\n{line}".format(line=line) + "The default value for blob or attachment attributes can only be NULL in:\n{line}".format(line=line) ) sql = ("`{name}` {type} {default}" + (' COMMENT "{comment}"' if match["comment"] else "")).format(**match) From 2de222ad8e94307b4c9049aab08e9e4c23e5b487 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 20:42:28 +0000 Subject: [PATCH 27/42] Simplify type system: only core types and AttributeTypes Major simplification of the type system to two categories: 1. Core DataJoint types (no brackets): float32, uuid, bool, json, blob, etc. 2. AttributeTypes (angle brackets): , , , etc. Changes: - declare.py: Remove EXTERNAL_TYPES, BINARY_TYPES; simplify to CORE_TYPE_ALIASES + ADAPTED - heading.py: Remove is_attachment, is_filepath, is_object, is_external flags - fetch.py: Simplify _get() to only handle uuid, json, blob, and adapters - table.py: Simplify __make_placeholder() to only handle uuid, json, blob, numeric - preview.py: Remove special object field handling (will be AttributeType) - staged_insert.py: Update object type check to use adapter All special handling (attach, filepath, object, external storage) will be implemented as built-in AttributeTypes in subsequent phases. 
Co-authored-by: dimitri-yatsenko --- src/datajoint/declare.py | 117 +++++++++++---------------------- src/datajoint/fetch.py | 110 +++++++++++-------------------- src/datajoint/heading.py | 103 ++++++++++------------------- src/datajoint/preview.py | 6 +- src/datajoint/staged_insert.py | 5 +- src/datajoint/table.py | 51 +++++++------- 6 files changed, 140 insertions(+), 252 deletions(-) diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index df89dede2..a333d5f87 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -11,13 +11,13 @@ from .attribute_adapter import get_adapter from .condition import translate_attribute -from .errors import FILEPATH_FEATURE_SWITCH, DataJointError, _support_filepath_types +from .errors import DataJointError from .settings import config -UUID_DATA_TYPE = "binary(16)" - -# Type aliases for numeric types -SQL_TYPE_ALIASES = { +# Core DataJoint type aliases - scientist-friendly names mapped to native SQL types +# These types can be used without angle brackets in table definitions +CORE_TYPE_ALIASES = { + # Numeric types "FLOAT32": "float", "FLOAT64": "double", "INT64": "bigint", @@ -29,18 +29,22 @@ "INT8": "tinyint", "UINT8": "tinyint unsigned", "BOOL": "tinyint", + # UUID type + "UUID": "binary(16)", } + MAX_TABLE_NAME_LENGTH = 64 CONSTANT_LITERALS = { "CURRENT_TIMESTAMP", "NULL", } # SQL literals to be used without quotes (case insensitive) -EXTERNAL_TABLE_ROOT = "~external" +# Type patterns for declaration parsing +# Two categories: core type aliases and native passthrough types TYPE_PATTERN = { k: re.compile(v, re.I) for k, v in dict( - # Type aliases must come before INTEGER and FLOAT patterns to avoid prefix matching + # Core DataJoint type aliases (scientist-friendly names) FLOAT32=r"float32$", FLOAT64=r"float64$", INT64=r"int64$", @@ -51,8 +55,9 @@ UINT16=r"uint16$", INT8=r"int8$", UINT8=r"uint8$", - BOOL=r"bool$", # aliased to tinyint - # Native MySQL types + BOOL=r"bool$", + UUID=r"uuid$", + # 
Native SQL types (passthrough) INTEGER=r"((tiny|small|medium|big|)int|integer)(\s*\(.+\))?(\s+unsigned)?(\s+auto_increment)?|serial$", DECIMAL=r"(decimal|numeric)(\s*\(.+\))?(\s+unsigned)?$", FLOAT=r"(double|float|real)(\s*\(.+\))?(\s+unsigned)?$", @@ -60,42 +65,19 @@ JSON=r"json$", ENUM=r"enum\s*\(.+\)$", TEMPORAL=r"(date|datetime|time|timestamp|year)(\s*\(.+\))?$", - INTERNAL_BLOB=r"(tiny|small|medium|long|)blob$", - EXTERNAL_BLOB=r"blob@(?P[a-z][\-\w]*)$", - INTERNAL_ATTACH=r"attach$", - EXTERNAL_ATTACH=r"attach@(?P[a-z][\-\w]*)$", - FILEPATH=r"filepath@(?P[a-z][\-\w]*)$", - OBJECT=r"object(@(?P[a-z][\-\w]*))?$", # managed object storage (files/folders) - UUID=r"uuid$", + BLOB=r"(tiny|small|medium|long|)blob$", + # AttributeTypes use angle brackets ADAPTED=r"<.+>$", ).items() } -# custom types are stored in attribute comment -SPECIAL_TYPES = { - "UUID", - "INTERNAL_ATTACH", - "EXTERNAL_ATTACH", - "EXTERNAL_BLOB", - "FILEPATH", - "OBJECT", - "ADAPTED", -} | set(SQL_TYPE_ALIASES) +# Types that require special handling (stored in attribute comment for reconstruction) +SPECIAL_TYPES = {"ADAPTED"} | set(CORE_TYPE_ALIASES) + +# Native SQL types that pass through without modification NATIVE_TYPES = set(TYPE_PATTERN) - SPECIAL_TYPES -EXTERNAL_TYPES = { - "EXTERNAL_ATTACH", - "EXTERNAL_BLOB", - "FILEPATH", -} # data referenced by a UUID in external tables -# Blob and attachment types cannot have SQL default values (other than NULL) -BINARY_TYPES = { - "EXTERNAL_ATTACH", - "INTERNAL_ATTACH", - "EXTERNAL_BLOB", - "INTERNAL_BLOB", -} -assert set().union(SPECIAL_TYPES, EXTERNAL_TYPES, BINARY_TYPES) <= set(TYPE_PATTERN) +assert SPECIAL_TYPES <= set(TYPE_PATTERN) def match_type(attribute_type): @@ -459,50 +441,32 @@ def format_attribute(attr): def substitute_special_type(match, category, foreign_key_sql, context): """ + Substitute special types with their native SQL equivalents. + + Special types are: + - Core type aliases (float32 → float, uuid → binary(16), etc.) 
+ - ADAPTED types (AttributeTypes in angle brackets) + :param match: dict containing with keys "type" and "comment" -- will be modified in place :param category: attribute type category from TYPE_PATTERN :param foreign_key_sql: list of foreign key declarations to add to :param context: context for looking up user-defined attribute_type adapters """ - if category == "UUID": - match["type"] = UUID_DATA_TYPE - elif category == "INTERNAL_ATTACH": - match["type"] = "LONGBLOB" - elif category == "OBJECT": - # Object type stores metadata as JSON - no foreign key to external table - # Extract store name if present (object@store_name syntax) - if "@" in match["type"]: - match["store"] = match["type"].split("@", 1)[1] - match["type"] = "JSON" - elif category in EXTERNAL_TYPES: - if category == "FILEPATH" and not _support_filepath_types(): - raise DataJointError( - """ - The filepath data type is disabled until complete validation. - To turn it on as experimental feature, set the environment variable - {env} = TRUE or upgrade datajoint. - """.format(env=FILEPATH_FEATURE_SWITCH) - ) - match["store"] = match["type"].split("@", 1)[1] - match["type"] = UUID_DATA_TYPE - foreign_key_sql.append( - "FOREIGN KEY (`{name}`) REFERENCES `{{database}}`.`{external_table_root}_{store}` (`hash`) " - "ON UPDATE RESTRICT ON DELETE RESTRICT".format(external_table_root=EXTERNAL_TABLE_ROOT, **match) - ) - elif category == "ADAPTED": + if category == "ADAPTED": + # AttributeType - resolve to underlying dtype attr_type, store_name = get_adapter(context, match["type"]) - # Store the store parameter if present if store_name is not None: match["store"] = store_name match["type"] = attr_type.dtype + # Recursively resolve if dtype is also a special type category = match_type(match["type"]) if category in SPECIAL_TYPES: - # recursive redefinition from user-defined datatypes. 
substitute_special_type(match, category, foreign_key_sql, context) - elif category in SQL_TYPE_ALIASES: - match["type"] = SQL_TYPE_ALIASES[category] + elif category in CORE_TYPE_ALIASES: + # Core type alias - substitute with native SQL type + match["type"] = CORE_TYPE_ALIASES[category] else: - assert False, "Unknown special type" + assert False, f"Unknown special type: {category}" def compile_attribute(line, in_key, foreign_key_sql, context): @@ -513,7 +477,7 @@ def compile_attribute(line, in_key, foreign_key_sql, context): :param in_key: set to True if attribute is in primary key set :param foreign_key_sql: the list of foreign key declarations to add to :param context: context in which to look up user-defined attribute type adapterss - :returns: (name, sql, is_external) -- attribute name and sql code for its declaration + :returns: (name, sql, store) -- attribute name, sql code for its declaration, and optional store name """ try: match = attribute_parser.parseString(line + "#", parseAll=True) @@ -550,13 +514,10 @@ def compile_attribute(line, in_key, foreign_key_sql, context): match["comment"] = ":{type}:{comment}".format(**match) # insert custom type into comment substitute_special_type(match, category, foreign_key_sql, context) - if category in BINARY_TYPES and match["default"] not in { - "DEFAULT NULL", - "NOT NULL", - }: - raise DataJointError( - "The default value for blob or attachment attributes can only be NULL in:\n{line}".format(line=line) - ) + # Check for invalid default values on blob types (after type substitution) + final_category = match_type(match["type"]) + if final_category == "BLOB" and match["default"] not in {"DEFAULT NULL", "NOT NULL"}: + raise DataJointError("The default value for blob attributes can only be NULL in:\n{line}".format(line=line)) sql = ("`{name}` {type} {default}" + (' COMMENT "{comment}"' if match["comment"] else "")).format(**match) return match["name"], sql, match.get("store") diff --git a/src/datajoint/fetch.py 
b/src/datajoint/fetch.py index e1b655fc0..000ab0bfd 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -1,21 +1,15 @@ -import itertools import json import numbers -import uuid +import uuid as uuid_module from functools import partial -from pathlib import Path import numpy as np import pandas from datajoint.condition import Top -from . import hash from .errors import DataJointError -from .objectref import ObjectRef from .settings import config -from .storage import StorageBackend -from .utils import safe_write class key: @@ -39,79 +33,51 @@ def to_dicts(recarray): def _get(connection, attr, data, squeeze, download_path): """ - This function is called for every attribute + Retrieve and decode attribute data from the database. + + In the simplified type system: + - Native types pass through unchanged + - JSON types are parsed + - UUID types are converted from bytes + - Blob types return raw bytes (unless an adapter handles them) + - Adapters (AttributeTypes) handle all custom encoding/decoding :param connection: a dj.Connection object - :param attr: attribute name from the table's heading - :param data: literal value fetched from the table - :param squeeze: if True squeeze blobs - :param download_path: for fetches that download data, e.g. 
attachments - :return: unpacked data + :param attr: attribute from the table's heading + :param data: raw value fetched from the database + :param squeeze: if True squeeze blobs (legacy, unused) + :param download_path: for fetches that download data (legacy, unused in simplified model) + :return: decoded data """ if data is None: - return - if attr.is_object: - # Object type - return ObjectRef handle - json_data = json.loads(data) if isinstance(data, str) else data - # Get the correct backend based on store name in metadata - store_name = json_data.get("store") # None for default store - try: - spec = config.get_object_store_spec(store_name) - backend = StorageBackend(spec) - except DataJointError: - backend = None - return ObjectRef.from_json(json_data, backend=backend) + return None + + # JSON type - parse and optionally decode via adapter if attr.json: - return json.loads(data) - - extern = connection.schemas[attr.database].external[attr.store] if attr.is_external else None - - # apply custom attribute type decoder if present - def adapt(x): - return attr.adapter.decode(x, key=None) if attr.adapter else x - - if attr.is_filepath: - return adapt(extern.download_filepath(uuid.UUID(bytes=data))[0]) - if attr.is_attachment: - # Steps: - # 1. get the attachment filename - # 2. check if the file already exists at download_path, verify checksum - # 3. if exists and checksum passes then return the local filepath - # 4. 
Otherwise, download the remote file and return the new filepath - _uuid = uuid.UUID(bytes=data) if attr.is_external else None - attachment_name = extern.get_attachment_name(_uuid) if attr.is_external else data.split(b"\0", 1)[0].decode() - local_filepath = Path(download_path) / attachment_name - if local_filepath.is_file(): - attachment_checksum = _uuid if attr.is_external else hash.uuid_from_buffer(data) - if attachment_checksum == hash.uuid_from_file(local_filepath, init_string=attachment_name + "\0"): - return adapt(str(local_filepath)) # checksum passed, no need to download again - # generate the next available alias filename - for n in itertools.count(): - f = local_filepath.parent / (local_filepath.stem + "_%04x" % n + local_filepath.suffix) - if not f.is_file(): - local_filepath = f - break - if attachment_checksum == hash.uuid_from_file(f, init_string=attachment_name + "\0"): - return adapt(str(f)) # checksum passed, no need to download again - # Save attachment - if attr.is_external: - extern.download_attachment(_uuid, attachment_name, local_filepath) - else: - # write from buffer - safe_write(local_filepath, data.split(b"\0", 1)[1]) - return adapt(str(local_filepath)) # download file from remote store + parsed = json.loads(data) + if attr.adapter: + return attr.adapter.decode(parsed, key=None) + return parsed + # UUID type - convert bytes to UUID object if attr.uuid: - return adapt(uuid.UUID(bytes=data)) - elif attr.is_blob: - blob_data = extern.get(uuid.UUID(bytes=data)) if attr.is_external else data - # Adapters (like ) handle deserialization in decode() - # Without adapter, blob columns return raw bytes (no deserialization) + result = uuid_module.UUID(bytes=data) if attr.adapter: - return attr.adapter.decode(blob_data, key=None) - return blob_data # raw bytes - else: - return adapt(data) + return attr.adapter.decode(result, key=None) + return result + + # Blob type - return raw bytes or decode via adapter + if attr.is_blob: + if attr.adapter: + return 
attr.adapter.decode(data, key=None) + return data # raw bytes + + # Other types with adapter + if attr.adapter: + return attr.adapter.decode(data, key=None) + + # Native types - pass through unchanged + return data class Fetch: diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index cc8034cd7..07617004e 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -8,13 +8,11 @@ from .attribute_adapter import get_adapter from .attribute_type import AttributeType from .declare import ( - EXTERNAL_TYPES, - NATIVE_TYPES, + CORE_TYPE_ALIASES, SPECIAL_TYPES, TYPE_PATTERN, - UUID_DATA_TYPE, ) -from .errors import FILEPATH_FEATURE_SWITCH, DataJointError, _support_filepath_types +from .errors import DataJointError class _MissingType(AttributeType): @@ -62,10 +60,6 @@ def decode(self, stored, *, key=None): uuid=False, json=None, is_blob=False, - is_attachment=False, - is_filepath=False, - is_object=False, - is_external=False, is_hidden=False, adapter=None, store=None, @@ -88,11 +82,13 @@ def todict(self): @property def sql_type(self): """:return: datatype (as string) in database. In most cases, it is the same as self.type""" - return UUID_DATA_TYPE if self.uuid else self.type + # UUID is now a core type alias - already resolved to binary(16) + return self.type @property def sql_comment(self): """:return: full comment for the SQL declaration. 
Includes custom type specification""" + # UUID info is stored in the comment for reconstruction return (":uuid:" if self.uuid else "") + self.comment @property @@ -167,17 +163,10 @@ def secondary_attributes(self): def blobs(self): return [k for k, v in self.attributes.items() if v.is_blob] - @property - def objects(self): - return [k for k, v in self.attributes.items() if v.is_object] - @property def non_blobs(self): - return [ - k - for k, v in self.attributes.items() - if not (v.is_blob or v.is_attachment or v.is_filepath or v.is_object or v.json) - ] + """Attributes that are not blobs or JSON (used for simple column handling).""" + return [k for k, v in self.attributes.items() if not (v.is_blob or v.json)] @property def new_attributes(self): @@ -298,15 +287,11 @@ def _init_from_database(self): autoincrement=bool(re.search(r"auto_increment", attr["Extra"], flags=re.I)), numeric=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("DECIMAL", "INTEGER", "FLOAT")), string=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("ENUM", "TEMPORAL", "STRING")), - is_blob=bool(TYPE_PATTERN["INTERNAL_BLOB"].match(attr["type"])), + is_blob=bool(TYPE_PATTERN["BLOB"].match(attr["type"])), uuid=False, json=bool(TYPE_PATTERN["JSON"].match(attr["type"])), - is_attachment=False, - is_filepath=False, - is_object=False, adapter=None, store=None, - is_external=False, attribute_expression=None, is_hidden=attr["name"].startswith("_"), ) @@ -316,26 +301,34 @@ def _init_from_database(self): attr["unsupported"] = not any((attr["is_blob"], attr["numeric"], attr["numeric"])) attr.pop("Extra") - # process custom DataJoint types + # process custom DataJoint types stored in comment special = re.match(r":(?P[^:]+):(?P.*)", attr["comment"]) if special: special = special.groupdict() attr.update(special) - # process custom attribute types (adapted types) + + # process AttributeTypes (adapted types in angle brackets) if special and TYPE_PATTERN["ADAPTED"].match(attr["type"]): assert context is not None, 
"Declaration context is not set" adapter_name = special["type"] try: - attr.update(adapter=get_adapter(context, adapter_name)) + adapter_result = get_adapter(context, adapter_name) + # get_adapter returns (adapter, store_name) tuple + if isinstance(adapter_result, tuple): + attr["adapter"], attr["store"] = adapter_result + else: + attr["adapter"] = adapter_result except DataJointError: # if no adapter, then delay the error until the first invocation - attr.update(adapter=_MissingType(adapter_name)) + attr["adapter"] = _MissingType(adapter_name) else: - attr.update(type=attr["adapter"].dtype) + attr["type"] = attr["adapter"].dtype if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()): raise DataJointError(f"Invalid dtype '{attr['type']}' in attribute type <{adapter_name}>.") - special = not any(TYPE_PATTERN[c].match(attr["type"]) for c in NATIVE_TYPES) + # Update is_blob based on resolved dtype + attr["is_blob"] = bool(TYPE_PATTERN["BLOB"].match(attr["type"])) + # Handle core type aliases (uuid, float32, etc.) if special: try: category = next(c for c in SPECIAL_TYPES if TYPE_PATTERN[c].match(attr["type"])) @@ -350,46 +343,18 @@ def _init_from_database(self): url=url, **attr ) ) - raise DataJointError("Unknown attribute type `{type}`".format(**attr)) - if category == "FILEPATH" and not _support_filepath_types(): - raise DataJointError( - """ - The filepath data type is disabled until complete validation. - To turn it on as experimental feature, set the environment variable - {env} = TRUE or upgrade datajoint. 
- """.format(env=FILEPATH_FEATURE_SWITCH) - ) - # Extract store name for external types and object types with named stores - store = None - if category in EXTERNAL_TYPES: - store = attr["type"].split("@")[1] - elif category == "OBJECT" and "@" in attr["type"]: - store = attr["type"].split("@")[1] - - attr.update( - unsupported=False, - is_attachment=category in ("INTERNAL_ATTACH", "EXTERNAL_ATTACH"), - is_filepath=category == "FILEPATH", - is_object=category == "OBJECT", - # INTERNAL_BLOB is not a custom type but is included for completeness - is_blob=category in ("INTERNAL_BLOB", "EXTERNAL_BLOB"), - uuid=category == "UUID", - is_external=category in EXTERNAL_TYPES, - store=store, - ) + # Not a special type - that's fine, could be native passthrough + category = None - if attr["in_key"] and any( - ( - attr["is_blob"], - attr["is_attachment"], - attr["is_filepath"], - attr["is_object"], - attr["json"], - ) - ): - raise DataJointError( - "Json, Blob, attachment, filepath, or object attributes " "are not allowed in the primary key" - ) + if category == "UUID": + attr["uuid"] = True + elif category in CORE_TYPE_ALIASES: + # Core type alias - already resolved in DB + pass + + # Check primary key constraints + if attr["in_key"] and (attr["is_blob"] or attr["json"]): + raise DataJointError("Blob or JSON attributes are not allowed in the primary key") if attr["string"] and attr["default"] is not None and attr["default"] not in sql_literals: attr["default"] = '"%s"' % attr["default"] @@ -410,7 +375,7 @@ def _init_from_database(self): attr["dtype"] = numeric_types[(t, is_unsigned)] if attr["adapter"]: - # restore adapted type name + # restore adapted type name for display attr["type"] = adapter_name self._attributes = dict(((q["name"], Attribute(**q)) for q in attributes)) diff --git a/src/datajoint/preview.py b/src/datajoint/preview.py index 5c61db1da..7572125e9 100644 --- a/src/datajoint/preview.py +++ b/src/datajoint/preview.py @@ -27,7 +27,8 @@ def 
_format_object_display(json_data): def preview(query_expression, limit, width): heading = query_expression.heading rel = query_expression.proj(*heading.non_blobs) - object_fields = heading.objects + # Object fields are AttributeTypes with adapters - not specially handled in simplified model + object_fields = [] if limit is None: limit = config["display.limit"] if width is None: @@ -87,7 +88,8 @@ def get_display_value(tup, f, idx): def repr_html(query_expression): heading = query_expression.heading rel = query_expression.proj(*heading.non_blobs) - object_fields = heading.objects + # Object fields are AttributeTypes with adapters - not specially handled in simplified model + object_fields = [] info = heading.table_status tuples = rel.fetch(limit=config["display.limit"] + 1, format="array") has_more = len(tuples) > config["display.limit"] diff --git a/src/datajoint/staged_insert.py b/src/datajoint/staged_insert.py index 9083bb78b..3a3d5bd17 100644 --- a/src/datajoint/staged_insert.py +++ b/src/datajoint/staged_insert.py @@ -98,8 +98,9 @@ def _get_storage_path(self, field: str, ext: str = "") -> str: raise DataJointError(f"Attribute '{field}' not found in table heading") attr = self._table.heading[field] - if not attr.is_object: - raise DataJointError(f"Attribute '{field}' is not an object type") + # Check if this is an object AttributeType (has adapter with "object" in type_name) + if not (attr.adapter and hasattr(attr.adapter, "type_name") and "object" in attr.adapter.type_name): + raise DataJointError(f"Attribute '{field}' is not an type") # Extract primary key from rec primary_key = {k: self._rec[k] for k in self._table.primary_key if k in self._rec} diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 02374b9ff..170e06089 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -924,56 +924,49 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): as a string to be included in the query and the value, if any, to 
be submitted for processing by mysql API. + In the simplified type system: + - Adapters (AttributeTypes) handle all custom encoding + - UUID values are converted to bytes + - JSON values are serialized + - Blob values pass through as bytes + - Numeric values are stringified + :param name: name of attribute to be inserted :param value: value of attribute to be inserted :param ignore_extra_fields: if True, return None for unknown fields - :param row: the full row dict (needed for object attributes to extract primary key) + :param row: the full row dict (unused in simplified model) """ if ignore_extra_fields and name not in self.heading: return None attr = self.heading[name] + + # Apply adapter encoding first (if present) if attr.adapter: - # Custom attribute type: validate and encode attr.adapter.validate(value) value = attr.adapter.encode(value, key=None) + + # Handle NULL values if value is None or (attr.numeric and (value == "" or np.isnan(float(value)))): - # set default value placeholder, value = "DEFAULT", None - else: # not NULL + else: placeholder = "%s" + # UUID - convert to bytes if attr.uuid: if not isinstance(value, uuid.UUID): try: value = uuid.UUID(value) except (AttributeError, ValueError): - raise DataJointError("badly formed UUID value {v} for attribute `{n}`".format(v=value, n=name)) + raise DataJointError(f"badly formed UUID value {value} for attribute `{name}`") value = value.bytes - elif attr.is_blob: - # Adapters (like ) handle serialization in encode() - # Without adapter, blob columns store raw bytes (no serialization) - if attr.is_external: - value = self.external[attr.store].put(value).bytes - elif attr.is_attachment: - attachment_path = Path(value) - if attr.is_external: - # value is hash of contents - value = self.external[attr.store].upload_attachment(attachment_path).bytes - else: - # value is filename + contents - value = str.encode(attachment_path.name) + b"\0" + attachment_path.read_bytes() - elif attr.is_filepath: - value = 
self.external[attr.store].upload_filepath(value).bytes - elif attr.is_object: - # Object type - upload to object storage and return JSON metadata - if row is None: - raise DataJointError( - f"Object attribute {name} requires full row context for insert. " "This is an internal error." - ) - value = self._process_object_value(name, value, row, store_name=attr.store) - elif attr.numeric: - value = str(int(value) if isinstance(value, bool) else value) + # JSON - serialize to string elif attr.json: value = json.dumps(value) + # Numeric - convert to string + elif attr.numeric: + value = str(int(value) if isinstance(value, bool) else value) + # Blob - pass through as bytes (adapters handle serialization) + # elif attr.is_blob: pass through unchanged + return name, placeholder, value def __make_row_to_insert(self, row, field_list, ignore_extra_fields): From f35e027525535f004aac2a5c6f3bc7340302a5b6 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 20:51:16 +0000 Subject: [PATCH 28/42] =?UTF-8?q?Define=20complete=20core=20type=20system?= =?UTF-8?q?=20with=20blob=E2=86=92longblob=20mapping?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Core DataJoint types (fully supported, recorded in :type: comments): - Numeric: float32, float64, int64, uint64, int32, uint32, int16, uint16, int8, uint8 - Boolean: bool - UUID: uuid → binary(16) - JSON: json - Binary: blob → longblob - Temporal: date, datetime - String: char(n), varchar(n) - Enumeration: enum(...) Changes: - declare.py: Define CORE_TYPES with (pattern, sql_mapping) pairs - declare.py: Add warning for non-standard native type usage - heading.py: Update to use CORE_TYPE_NAMES - storage-types-spec.md: Update documentation to reflect core types Native database types (text, mediumint, etc.) pass through with a warning about non-standard usage. 
Co-authored-by: dimitri-yatsenko --- docs/src/design/tables/storage-types-spec.md | 87 +++++++------- src/datajoint/declare.py | 117 +++++++++++-------- src/datajoint/heading.py | 4 +- 3 files changed, 118 insertions(+), 90 deletions(-) diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index a962ee6c8..668fdfdf5 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -12,19 +12,20 @@ This document defines a three-layer type architecture: ┌───────────────────────────────────────────────────────────────────┐ │ AttributeTypes (Layer 3) │ │ │ -│ Built-in: │ +│ Built-in: │ │ User: ... │ ├───────────────────────────────────────────────────────────────────┤ │ Core DataJoint Types (Layer 2) │ │ │ -│ int8 int16 int32 int64 float32 float64 bool decimal │ -│ uint8 uint16 uint32 uint64 varchar char uuid date │ -│ json longblob blob timestamp datetime enum │ +│ float32 float64 int64 uint64 int32 uint32 int16 uint16 │ +│ int8 uint8 bool uuid json blob date datetime │ +│ char(n) varchar(n) enum(...) │ ├───────────────────────────────────────────────────────────────────┤ │ Native Database Types (Layer 1) │ │ │ │ MySQL: TINYINT SMALLINT INT BIGINT FLOAT DOUBLE ... │ │ PostgreSQL: SMALLINT INTEGER BIGINT REAL DOUBLE PRECISION │ +│ (pass through with warning for non-standard types) │ └───────────────────────────────────────────────────────────────────┘ ``` @@ -49,61 +50,65 @@ For arbitrary URLs that don't need ObjectRef semantics, use `varchar` instead. Core types provide a standardized, scientist-friendly interface that works identically across MySQL and PostgreSQL backends. Users should prefer these over native database types. 
+**All core types are recorded in field comments using `:type:` syntax for reconstruction.** + ### Numeric Types -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `int8` | 8-bit signed | `TINYINT` | `SMALLINT` (clamped) | -| `int16` | 16-bit signed | `SMALLINT` | `SMALLINT` | -| `int32` | 32-bit signed | `INT` | `INTEGER` | -| `int64` | 64-bit signed | `BIGINT` | `BIGINT` | -| `uint8` | 8-bit unsigned | `TINYINT UNSIGNED` | `SMALLINT` (checked) | -| `uint16` | 16-bit unsigned | `SMALLINT UNSIGNED` | `INTEGER` (checked) | -| `uint32` | 32-bit unsigned | `INT UNSIGNED` | `BIGINT` (checked) | -| `uint64` | 64-bit unsigned | `BIGINT UNSIGNED` | `NUMERIC(20)` | -| `float32` | 32-bit float | `FLOAT` | `REAL` | -| `float64` | 64-bit float | `DOUBLE` | `DOUBLE PRECISION` | -| `decimal(p,s)` | Fixed precision | `DECIMAL(p,s)` | `NUMERIC(p,s)` | +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `int8` | 8-bit signed | `TINYINT` | +| `int16` | 16-bit signed | `SMALLINT` | +| `int32` | 32-bit signed | `INT` | +| `int64` | 64-bit signed | `BIGINT` | +| `uint8` | 8-bit unsigned | `TINYINT UNSIGNED` | +| `uint16` | 16-bit unsigned | `SMALLINT UNSIGNED` | +| `uint32` | 32-bit unsigned | `INT UNSIGNED` | +| `uint64` | 64-bit unsigned | `BIGINT UNSIGNED` | +| `float32` | 32-bit float | `FLOAT` | +| `float64` | 64-bit float | `DOUBLE` | ### String Types -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `char(n)` | Fixed-length | `CHAR(n)` | `CHAR(n)` | -| `varchar(n)` | Variable-length | `VARCHAR(n)` | `VARCHAR(n)` | +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `char(n)` | Fixed-length | `CHAR(n)` | +| `varchar(n)` | Variable-length | `VARCHAR(n)` | ### Boolean -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `bool` | True/False | `TINYINT(1)` | `BOOLEAN` | +| Core 
Type | Description | MySQL | +|-----------|-------------|-------| +| `bool` | True/False | `TINYINT` | ### Date/Time Types -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `date` | Date only | `DATE` | `DATE` | -| `datetime` | Date and time | `DATETIME(6)` | `TIMESTAMP` | -| `timestamp` | Auto-updating | `TIMESTAMP` | `TIMESTAMP` | -| `time` | Time only | `TIME` | `TIME` | +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `date` | Date only | `DATE` | +| `datetime` | Date and time | `DATETIME` | ### Binary Types -Core binary types store raw bytes without any serialization. Use `<djblob>` AttributeType +The core `blob` type stores raw bytes without any serialization. Use `<djblob>` AttributeType for serialized Python objects. -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `blob` | Raw bytes up to 64KB | `BLOB` | `BYTEA` | -| `longblob` | Raw bytes up to 4GB | `LONGBLOB` | `BYTEA` | +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `blob` | Raw bytes | `LONGBLOB` | + +### Other Types + +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `json` | JSON document | `JSON` | +| `uuid` | UUID | `BINARY(16)` | +| `enum(...)` | Enumeration | `ENUM(...)` | -### Special Types +### Native Passthrough Types -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `json` | JSON document | `JSON` | `JSONB` | -| `uuid` | UUID | `CHAR(36)` | `UUID` | -| `enum(...)` | Enumeration | `ENUM(...)` | `VARCHAR` + CHECK | +Users may use native database types directly (e.g., `text`, `mediumint auto_increment`), +but these will generate a warning about non-standard usage. Native types are not recorded +in field comments and may have portability issues across database backends.
## AttributeTypes (Layer 3) diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index a333d5f87..c08a5fd4c 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -14,25 +14,44 @@ from .errors import DataJointError from .settings import config -# Core DataJoint type aliases - scientist-friendly names mapped to native SQL types -# These types can be used without angle brackets in table definitions -CORE_TYPE_ALIASES = { - # Numeric types - "FLOAT32": "float", - "FLOAT64": "double", - "INT64": "bigint", - "UINT64": "bigint unsigned", - "INT32": "int", - "UINT32": "int unsigned", - "INT16": "smallint", - "UINT16": "smallint unsigned", - "INT8": "tinyint", - "UINT8": "tinyint unsigned", - "BOOL": "tinyint", - # UUID type - "UUID": "binary(16)", +# Core DataJoint types - scientist-friendly names that are fully supported +# These are recorded in field comments using :type: syntax for reconstruction +# Format: pattern_name -> (regex_pattern, mysql_type or None if same as matched) +CORE_TYPES = { + # Numeric types (aliased to native SQL) + "float32": (r"float32$", "float"), + "float64": (r"float64$", "double"), + "int64": (r"int64$", "bigint"), + "uint64": (r"uint64$", "bigint unsigned"), + "int32": (r"int32$", "int"), + "uint32": (r"uint32$", "int unsigned"), + "int16": (r"int16$", "smallint"), + "uint16": (r"uint16$", "smallint unsigned"), + "int8": (r"int8$", "tinyint"), + "uint8": (r"uint8$", "tinyint unsigned"), + "bool": (r"bool$", "tinyint"), + # UUID (stored as binary) + "uuid": (r"uuid$", "binary(16)"), + # JSON + "json": (r"json$", None), # json passes through as-is + # Binary (blob maps to longblob) + "blob": (r"blob$", "longblob"), + # Temporal + "date": (r"date$", None), + "datetime": (r"datetime$", None), + # String types (with parameters) + "char": (r"char\s*\(\d+\)$", None), + "varchar": (r"varchar\s*\(\d+\)$", None), + # Enumeration + "enum": (r"enum\s*\(.+\)$", None), } +# Compile core type patterns +CORE_TYPE_PATTERNS = 
{name: re.compile(pattern, re.I) for name, (pattern, _) in CORE_TYPES.items()} + +# Get SQL mapping for core types +CORE_TYPE_SQL = {name: sql_type for name, (_, sql_type) in CORE_TYPES.items()} + MAX_TABLE_NAME_LENGTH = 64 CONSTANT_LITERALS = { "CURRENT_TIMESTAMP", @@ -40,47 +59,38 @@ } # SQL literals to be used without quotes (case insensitive) # Type patterns for declaration parsing -# Two categories: core type aliases and native passthrough types TYPE_PATTERN = { k: re.compile(v, re.I) for k, v in dict( - # Core DataJoint type aliases (scientist-friendly names) - FLOAT32=r"float32$", - FLOAT64=r"float64$", - INT64=r"int64$", - UINT64=r"uint64$", - INT32=r"int32$", - UINT32=r"uint32$", - INT16=r"int16$", - UINT16=r"uint16$", - INT8=r"int8$", - UINT8=r"uint8$", - BOOL=r"bool$", - UUID=r"uuid$", - # Native SQL types (passthrough) + # Core DataJoint types + **{name.upper(): pattern for name, (pattern, _) in CORE_TYPES.items()}, + # Native SQL types (passthrough with warning for non-standard use) INTEGER=r"((tiny|small|medium|big|)int|integer)(\s*\(.+\))?(\s+unsigned)?(\s+auto_increment)?|serial$", DECIMAL=r"(decimal|numeric)(\s*\(.+\))?(\s+unsigned)?$", FLOAT=r"(double|float|real)(\s*\(.+\))?(\s+unsigned)?$", - STRING=r"(var)?char\s*\(.+\)$", - JSON=r"json$", - ENUM=r"enum\s*\(.+\)$", - TEMPORAL=r"(date|datetime|time|timestamp|year)(\s*\(.+\))?$", - BLOB=r"(tiny|small|medium|long|)blob$", + STRING=r"(var)?char\s*\(.+\)$", # Catches char/varchar not matched by core types + TEMPORAL=r"(time|timestamp|year)(\s*\(.+\))?$", # time, timestamp, year (not date/datetime) + NATIVE_BLOB=r"(tiny|small|medium|long)blob$", # Specific blob variants + TEXT=r"(tiny|small|medium|long)?text$", # Text types # AttributeTypes use angle brackets ADAPTED=r"<.+>$", ).items() } -# Types that require special handling (stored in attribute comment for reconstruction) -SPECIAL_TYPES = {"ADAPTED"} | set(CORE_TYPE_ALIASES) +# Core types are stored in attribute comment for reconstruction 
+CORE_TYPE_NAMES = {name.upper() for name in CORE_TYPES} + +# Special types that need comment storage (core types + adapted) +SPECIAL_TYPES = CORE_TYPE_NAMES | {"ADAPTED"} -# Native SQL types that pass through without modification +# Native SQL types that pass through (with optional warning) NATIVE_TYPES = set(TYPE_PATTERN) - SPECIAL_TYPES assert SPECIAL_TYPES <= set(TYPE_PATTERN) def match_type(attribute_type): + """Match an attribute type string to a category.""" try: return next(category for category, pattern in TYPE_PATTERN.items() if pattern.match(attribute_type)) except StopIteration: @@ -444,7 +454,7 @@ def substitute_special_type(match, category, foreign_key_sql, context): Substitute special types with their native SQL equivalents. Special types are: - - Core type aliases (float32 → float, uuid → binary(16), etc.) + - Core DataJoint types (float32 → float, uuid → binary(16), blob → longblob, etc.) - ADAPTED types (AttributeTypes in angle brackets) :param match: dict containing with keys "type" and "comment" -- will be modified in place @@ -462,9 +472,13 @@ def substitute_special_type(match, category, foreign_key_sql, context): category = match_type(match["type"]) if category in SPECIAL_TYPES: substitute_special_type(match, category, foreign_key_sql, context) - elif category in CORE_TYPE_ALIASES: - # Core type alias - substitute with native SQL type - match["type"] = CORE_TYPE_ALIASES[category] + elif category in CORE_TYPE_NAMES: + # Core DataJoint type - substitute with native SQL type if mapping exists + core_name = category.lower() + sql_type = CORE_TYPE_SQL.get(core_name) + if sql_type is not None: + match["type"] = sql_type + # else: type passes through as-is (json, date, datetime, char, varchar, enum) else: assert False, f"Unknown special type: {category}" @@ -510,13 +524,22 @@ def compile_attribute(line, in_key, foreign_key_sql, context): raise DataJointError('An attribute comment must not start with a colon in comment "{comment}"'.format(**match)) 
category = match_type(match["type"]) + if category in SPECIAL_TYPES: - match["comment"] = ":{type}:{comment}".format(**match) # insert custom type into comment + # Core types and AttributeTypes are recorded in comment for reconstruction + match["comment"] = ":{type}:{comment}".format(**match) substitute_special_type(match, category, foreign_key_sql, context) + elif category in NATIVE_TYPES: + # Non-standard native type - warn user + logger.warning( + f"Non-standard native type '{match['type']}' in attribute '{match['name']}'. " + "Consider using a core DataJoint type for better portability." + ) # Check for invalid default values on blob types (after type substitution) - final_category = match_type(match["type"]) - if final_category == "BLOB" and match["default"] not in {"DEFAULT NULL", "NOT NULL"}: + # Note: blob → longblob, so check for NATIVE_BLOB or longblob result + final_type = match["type"].lower() + if ("blob" in final_type) and match["default"] not in {"DEFAULT NULL", "NOT NULL"}: raise DataJointError("The default value for blob attributes can only be NULL in:\n{line}".format(line=line)) sql = ("`{name}` {type} {default}" + (' COMMENT "{comment}"' if match["comment"] else "")).format(**match) diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 07617004e..9750b84f3 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -8,7 +8,7 @@ from .attribute_adapter import get_adapter from .attribute_type import AttributeType from .declare import ( - CORE_TYPE_ALIASES, + CORE_TYPE_NAMES, SPECIAL_TYPES, TYPE_PATTERN, ) @@ -348,7 +348,7 @@ def _init_from_database(self): if category == "UUID": attr["uuid"] = True - elif category in CORE_TYPE_ALIASES: + elif category in CORE_TYPE_NAMES: # Core type alias - already resolved in DB pass From 746108a63072650b9fdd8b6df13e74acff12066b Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 20:55:51 +0000 Subject: [PATCH 29/42] Implement Phase 2: Content-Addressed Storage Add 
content-addressed storage with deduplication for the <content> and <xblob> AttributeTypes. New files: - content_registry.py: Content storage utilities - compute_content_hash(): SHA256 hashing - build_content_path(): Hierarchical path generation (_content/xx/yy/hash) - put_content(): Store with deduplication - get_content(): Retrieve with hash verification - content_exists(), delete_content(), get_content_size() New built-in AttributeTypes in attribute_type.py: - ContentType (<content>): Content-addressed storage for raw bytes - dtype = "json" (stores metadata: hash, store, size) - Automatic deduplication via SHA256 hashing - XBlobType (<xblob>): Serialized blobs with external storage - dtype = "<content>" (composition with ContentType) - Combines djblob serialization with content-addressed storage Updated insert/fetch for type chain support: - table.py: Apply encoder chain from outermost to innermost - fetch.py: Apply decoder chain from innermost to outermost - Both pass store_name through the chain for external storage Example usage: data : <content> # Raw bytes, deduplicated array : <xblob> # Serialized objects, deduplicated Co-authored-by: dimitri-yatsenko --- src/datajoint/attribute_type.py | 169 ++++++++++++++++++++++++++ src/datajoint/content_registry.py | 193 ++++++++++++++++++++++++++++++ src/datajoint/fetch.py | 45 ++++--- src/datajoint/table.py | 19 ++- 4 files changed, 405 insertions(+), 21 deletions(-) create mode 100644 src/datajoint/content_registry.py diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py index 97ca54646..2c06ccc83 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/attribute_type.py @@ -570,6 +570,173 @@ def decode(self, stored: bytes, *, key: dict | None = None) -> Any: return blob.unpack(stored, squeeze=False) +class ContentType(AttributeType): + """ + Built-in type for content-addressed storage with deduplication. + + The ``<content>`` type stores data using content-addressed storage.
Data is + identified by its SHA256 hash and stored in a hierarchical directory structure. + Duplicate content is automatically deduplicated - storing the same bytes twice + will only create one copy in storage. + + The database column stores JSON metadata including the content hash, store name, + and size. The actual content is stored in external storage. + + This type is primarily used as a building block for other types like ``<xblob>`` + and ``<xattach>``, but can also be used directly for raw binary content. + + Example: + @schema + class RawContent(dj.Manual): + definition = ''' + content_id : int + --- + data : <content> # Content-addressed storage + ''' + + # Insert raw bytes + table.insert1({'content_id': 1, 'data': b'raw binary content'}) + + # Fetch returns the original bytes + data = (table & 'content_id=1').fetch1('data') + assert data == b'raw binary content' + + Storage Structure: + Content is stored at: ``_content/{hash[:2]}/{hash[2:4]}/{hash}`` + This hierarchical structure prevents too many files in a single directory. + + Note: + The store parameter is required for ``<content>`` unless a default store + is configured. Use ``<content@store>`` syntax to specify the store. + """ + + type_name = "content" + dtype = "json" + + def encode(self, value: bytes, *, key: dict | None = None, store_name: str | None = None) -> dict: + """ + Store content and return metadata. + + Computes the SHA256 hash of the content and stores it using content-addressed + storage. If content with the same hash already exists, it is not re-uploaded + (deduplication). + + Args: + value: Raw bytes to store. + key: Primary key values (unused for content storage). + store_name: Store to use. If None, uses default store from config. + + Returns: + Metadata dict with keys: hash, store, size + + Raises: + TypeError: If value is not bytes.
+ """ + if not isinstance(value, bytes): + raise TypeError(f"<content> type expects bytes, got {type(value).__name__}") + + from .content_registry import put_content + + return put_content(value, store_name=store_name) + + def decode(self, stored: dict, *, key: dict | None = None) -> bytes: + """ + Retrieve content by its hash. + + Args: + stored: Metadata dict with 'hash' and optionally 'store' keys. + key: Primary key values (unused for content retrieval). + + Returns: + The original bytes. + + Raises: + MissingExternalFile: If content is not found. + DataJointError: If hash verification fails. + """ + from .content_registry import get_content + + content_hash = stored["hash"] + store_name = stored.get("store") + return get_content(content_hash, store_name=store_name) + + def validate(self, value: Any) -> None: + """Validate that value is bytes.""" + if not isinstance(value, bytes): + raise TypeError(f"<content> type expects bytes, got {type(value).__name__}") + + +class XBlobType(AttributeType): + """ + Built-in type for externally-stored serialized blobs with deduplication. + + The ``<xblob>`` type combines DataJoint's blob serialization with content-addressed + storage. Objects are serialized using the djblob format, then stored externally + using content-addressed storage for automatic deduplication. + + This type is ideal for large objects (NumPy arrays, pandas DataFrames, etc.) + that may be duplicated across multiple rows.
+ + Example: + @schema + class LargeArrays(dj.Manual): + definition = ''' + array_id : int + --- + data : <xblob> # External serialized blob with deduplication + ''' + + # Insert NumPy array + import numpy as np + table.insert1({'array_id': 1, 'data': np.random.rand(1000, 1000)}) + + # Fetch returns the original array + data = (table & 'array_id=1').fetch1('data') + + Note: + - For internal storage (in database), use ``<djblob>`` + - For external storage without serialization, use ``<content>`` + - The store parameter is required unless a default store is configured + """ + + type_name = "xblob" + dtype = "<content>" # Composition: uses ContentType for storage + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: + """ + Serialize a Python object to bytes. + + The object is serialized using DataJoint's blob format. The resulting + bytes are then passed to the underlying ``<content>`` type for storage. + + Args: + value: Any serializable Python object. + key: Primary key values (unused). + store_name: Store parameter (passed through to content storage). + + Returns: + Serialized bytes (will be stored by ContentType). + """ + from . import blob + + return blob.pack(value, compress=True) + + def decode(self, stored: bytes, *, key: dict | None = None) -> Any: + """ + Deserialize bytes back to a Python object. + + Args: + stored: Serialized bytes retrieved from content storage. + key: Primary key values (unused). + + Returns: + The deserialized Python object. + """ + from . import blob + + return blob.unpack(stored, squeeze=False) + + def _register_builtin_types() -> None: """ Register DataJoint's built-in attribute types. @@ -577,6 +744,8 @@ def _register_builtin_types() -> None: Called automatically during module initialization.
""" register_type(DJBlobType) + register_type(ContentType) + register_type(XBlobType) # Register built-in types when module is loaded diff --git a/src/datajoint/content_registry.py b/src/datajoint/content_registry.py new file mode 100644 index 000000000..5ff98e917 --- /dev/null +++ b/src/datajoint/content_registry.py @@ -0,0 +1,193 @@ +""" +Content-addressed storage registry for DataJoint. + +This module provides content-addressed storage with deduplication for the +AttributeType. Content is identified by its SHA256 hash and stored in a hierarchical +directory structure: _content/{hash[:2]}/{hash[2:4]}/{hash} + +The ContentRegistry tracks stored content for garbage collection purposes. +""" + +import hashlib +import logging +from typing import Any + +from .errors import DataJointError +from .settings import config +from .storage import StorageBackend + +logger = logging.getLogger(__name__.split(".")[0]) + + +def compute_content_hash(data: bytes) -> str: + """ + Compute SHA256 hash of content. + + Args: + data: Content bytes + + Returns: + Hex-encoded SHA256 hash (64 characters) + """ + return hashlib.sha256(data).hexdigest() + + +def build_content_path(content_hash: str) -> str: + """ + Build the storage path for content-addressed storage. + + Content is stored in a hierarchical structure to avoid too many files + in a single directory: _content/{hash[:2]}/{hash[2:4]}/{hash} + + Args: + content_hash: SHA256 hex hash (64 characters) + + Returns: + Relative path within the store + """ + if len(content_hash) != 64: + raise DataJointError(f"Invalid content hash length: {len(content_hash)} (expected 64)") + return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + + +def get_store_backend(store_name: str | None = None) -> StorageBackend: + """ + Get a StorageBackend for content storage. + + Args: + store_name: Name of the store to use. If None, uses the default store. 
+ + Returns: + StorageBackend instance + """ + if store_name is None: + # Use default store from object_storage settings + store_name = config.object_storage.default_store + if store_name is None: + raise DataJointError( + "No default store configured. Set object_storage.default_store " + "or specify a store name explicitly." + ) + + spec = config.get_object_store_spec(store_name) + return StorageBackend(spec) + + +def put_content(data: bytes, store_name: str | None = None) -> dict[str, Any]: + """ + Store content using content-addressed storage. + + If the content already exists (same hash), it is not re-uploaded. + Returns metadata including the hash, store, and size. + + Args: + data: Content bytes to store + store_name: Name of the store. If None, uses default store. + + Returns: + Metadata dict with keys: hash, store, size + """ + content_hash = compute_content_hash(data) + path = build_content_path(content_hash) + + backend = get_store_backend(store_name) + + # Check if content already exists (deduplication) + if not backend.exists(path): + backend.put_buffer(data, path) + logger.debug(f"Stored new content: {content_hash[:16]}... ({len(data)} bytes)") + else: + logger.debug(f"Content already exists: {content_hash[:16]}...") + + return { + "hash": content_hash, + "store": store_name, + "size": len(data), + } + + +def get_content(content_hash: str, store_name: str | None = None) -> bytes: + """ + Retrieve content by its hash. + + Args: + content_hash: SHA256 hex hash of the content + store_name: Name of the store. If None, uses default store. 
+ + Returns: + Content bytes + + Raises: + MissingExternalFile: If content is not found + DataJointError: If hash verification fails + """ + path = build_content_path(content_hash) + backend = get_store_backend(store_name) + + data = backend.get_buffer(path) + + # Verify hash (optional but recommended for integrity) + actual_hash = compute_content_hash(data) + if actual_hash != content_hash: + raise DataJointError( + f"Content hash mismatch: expected {content_hash[:16]}..., " + f"got {actual_hash[:16]}..." + ) + + return data + + +def content_exists(content_hash: str, store_name: str | None = None) -> bool: + """ + Check if content exists in storage. + + Args: + content_hash: SHA256 hex hash of the content + store_name: Name of the store. If None, uses default store. + + Returns: + True if content exists + """ + path = build_content_path(content_hash) + backend = get_store_backend(store_name) + return backend.exists(path) + + +def delete_content(content_hash: str, store_name: str | None = None) -> bool: + """ + Delete content from storage. + + WARNING: This should only be called after verifying no references exist. + Use garbage collection to safely remove unreferenced content. + + Args: + content_hash: SHA256 hex hash of the content + store_name: Name of the store. If None, uses default store. + + Returns: + True if content was deleted, False if it didn't exist + """ + path = build_content_path(content_hash) + backend = get_store_backend(store_name) + + if backend.exists(path): + backend.remove(path) + logger.debug(f"Deleted content: {content_hash[:16]}...") + return True + return False + + +def get_content_size(content_hash: str, store_name: str | None = None) -> int: + """ + Get the size of stored content. + + Args: + content_hash: SHA256 hex hash of the content + store_name: Name of the store. If None, uses default store. 
+ + Returns: + Size in bytes + """ + path = build_content_path(content_hash) + backend = get_store_backend(store_name) + return backend.size(path) diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index 000ab0bfd..d021a87d8 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -40,7 +40,10 @@ def _get(connection, attr, data, squeeze, download_path): - JSON types are parsed - UUID types are converted from bytes - Blob types return raw bytes (unless an adapter handles them) - - Adapters (AttributeTypes) handle all custom encoding/decoding + - Adapters (AttributeTypes) handle all custom encoding/decoding via type chains + + For composed types (e.g., using ), decoders are applied + in reverse order: innermost first, then outermost. :param connection: a dj.Connection object :param attr: attribute from the table's heading @@ -52,30 +55,36 @@ def _get(connection, attr, data, squeeze, download_path): if data is None: return None - # JSON type - parse and optionally decode via adapter + # Get the final storage type and type chain if adapter present + if attr.adapter: + from .attribute_type import resolve_dtype + + final_dtype, type_chain, _ = resolve_dtype(f"<{attr.adapter.type_name}>") + + # First, process the final dtype (what's stored in the database) + if final_dtype.lower() == "json": + data = json.loads(data) + elif final_dtype.lower() in ("longblob", "blob", "mediumblob", "tinyblob"): + pass # Blob data is already bytes + elif final_dtype.lower() == "binary(16)": + data = uuid_module.UUID(bytes=data) + + # Apply decoders in reverse order: innermost first, then outermost + for attr_type in reversed(type_chain): + data = attr_type.decode(data, key=None) + + return data + + # No adapter - handle native types if attr.json: - parsed = json.loads(data) - if attr.adapter: - return attr.adapter.decode(parsed, key=None) - return parsed + return json.loads(data) - # UUID type - convert bytes to UUID object if attr.uuid: - result = 
uuid_module.UUID(bytes=data) - if attr.adapter: - return attr.adapter.decode(result, key=None) - return result + return uuid_module.UUID(bytes=data) - # Blob type - return raw bytes or decode via adapter if attr.is_blob: - if attr.adapter: - return attr.adapter.decode(data, key=None) return data # raw bytes - # Other types with adapter - if attr.adapter: - return attr.adapter.decode(data, key=None) - # Native types - pass through unchanged return data diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 170e06089..009d475d2 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -925,7 +925,7 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): processing by mysql API. In the simplified type system: - - Adapters (AttributeTypes) handle all custom encoding + - Adapters (AttributeTypes) handle all custom encoding via type chains - UUID values are converted to bytes - JSON values are serialized - Blob values pass through as bytes @@ -940,10 +940,23 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): return None attr = self.heading[name] - # Apply adapter encoding first (if present) + # Apply adapter encoding with type chain support if attr.adapter: + from .attribute_type import resolve_dtype + attr.adapter.validate(value) - value = attr.adapter.encode(value, key=None) + + # Resolve full type chain + _, type_chain, resolved_store = resolve_dtype(f"<{attr.adapter.type_name}>", store_name=attr.store) + + # Apply encoders from outermost to innermost + for attr_type in type_chain: + # Pass store_name to encoders that support it + try: + value = attr_type.encode(value, key=None, store_name=resolved_store) + except TypeError: + # Encoder doesn't accept store_name parameter + value = attr_type.encode(value, key=None) # Handle NULL values if value is None or (attr.numeric and (value == "" or np.isnan(float(value)))): From 328a59a1927d10553c36224ff1aeee2874d9f1b5 Mon Sep 17 00:00:00 2001 
From: Claude Date: Thu, 25 Dec 2025 21:01:37 +0000 Subject: [PATCH 30/42] Apply ruff-format to content_registry.py Co-authored-by: dimitri-yatsenko --- src/datajoint/content_registry.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/datajoint/content_registry.py b/src/datajoint/content_registry.py index 5ff98e917..01e5844cf 100644 --- a/src/datajoint/content_registry.py +++ b/src/datajoint/content_registry.py @@ -65,8 +65,7 @@ def get_store_backend(store_name: str | None = None) -> StorageBackend: store_name = config.object_storage.default_store if store_name is None: raise DataJointError( - "No default store configured. Set object_storage.default_store " - "or specify a store name explicitly." + "No default store configured. Set object_storage.default_store " "or specify a store name explicitly." ) spec = config.get_object_store_spec(store_name) @@ -129,10 +128,7 @@ def get_content(content_hash: str, store_name: str | None = None) -> bytes: # Verify hash (optional but recommended for integrity) actual_hash = compute_content_hash(data) if actual_hash != content_hash: - raise DataJointError( - f"Content hash mismatch: expected {content_hash[:16]}..., " - f"got {actual_hash[:16]}..." - ) + raise DataJointError(f"Content hash mismatch: expected {content_hash[:16]}..., " f"got {actual_hash[:16]}...") return data From bbbfbc38a84a78cb6f5356f4d0991c5c2d5e0ff1 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 21:24:04 +0000 Subject: [PATCH 31/42] Remove legacy compatibility shims: attribute_adapter.py, bypass_serialization Breaking changes: - Remove attribute_adapter.py entirely (hard deprecate) - Remove bypass_serialization flag from blob.py - blobs always serialize now - Remove unused 'database' field from Attribute in heading.py Import get_adapter from attribute_type instead of attribute_adapter. 
Co-authored-by: dimitri-yatsenko --- src/datajoint/attribute_adapter.py | 42 ---------------------- src/datajoint/blob.py | 10 ------ src/datajoint/declare.py | 2 +- src/datajoint/heading.py | 4 +-- tests/test_bypass_serialization.py | 57 ------------------------------ 5 files changed, 2 insertions(+), 113 deletions(-) delete mode 100644 src/datajoint/attribute_adapter.py delete mode 100644 tests/test_bypass_serialization.py diff --git a/src/datajoint/attribute_adapter.py b/src/datajoint/attribute_adapter.py deleted file mode 100644 index c92618f9e..000000000 --- a/src/datajoint/attribute_adapter.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Attribute adapter module - compatibility shim. - -This module re-exports functions from attribute_type for backward compatibility -with code that imports from attribute_adapter. - -.. deprecated:: 0.15 - Import directly from :mod:`datajoint.attribute_type` instead. -""" - -from .attribute_type import ( - AttributeType, - get_type, - is_type_registered, - parse_type_spec, -) -from .errors import DataJointError - - -def get_adapter(context: dict | None, adapter_name: str) -> tuple[AttributeType, str | None]: - """ - Get an attribute type by name. - - Args: - context: Ignored (legacy parameter, kept for API compatibility). - adapter_name: The type name, with or without angle brackets. - May include store parameter (e.g., ""). - - Returns: - Tuple of (AttributeType instance, store_name or None). - - Raises: - DataJointError: If the type is not found. - """ - # Parse type name and optional store parameter - type_name, store_name = parse_type_spec(adapter_name) - - # Look up in the global type registry - if is_type_registered(type_name): - return get_type(type_name), store_name - - raise DataJointError(f"Attribute type <{type_name}> is not registered. 
" "Use @dj.register_type to register custom types.") diff --git a/src/datajoint/blob.py b/src/datajoint/blob.py index 424d88779..15364bfa4 100644 --- a/src/datajoint/blob.py +++ b/src/datajoint/blob.py @@ -56,8 +56,6 @@ compression = {b"ZL123\0": zlib.decompress} -bypass_serialization = False # runtime setting to bypass blob (en|de)code - # runtime setting to read integers as 32-bit to read blobs created by the 32-bit # version of the mYm library for MATLAB use_32bit_dims = False @@ -507,17 +505,9 @@ def pack(self, obj, compress): def pack(obj, compress=True): - if bypass_serialization: - # provide a way to move blobs quickly without de/serialization - assert isinstance(obj, bytes) and obj.startswith((b"ZL123\0", b"mYm\0", b"dj0\0")) - return obj return Blob().pack(obj, compress=compress) def unpack(blob, squeeze=False): - if bypass_serialization: - # provide a way to move blobs quickly without de/serialization - assert isinstance(blob, bytes) and blob.startswith((b"ZL123\0", b"mYm\0", b"dj0\0")) - return blob if blob is not None: return Blob(squeeze=squeeze).unpack(blob) diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index c08a5fd4c..68286de2c 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -9,7 +9,7 @@ import pyparsing as pp -from .attribute_adapter import get_adapter +from .attribute_type import get_adapter from .condition import translate_attribute from .errors import DataJointError from .settings import config diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 9750b84f3..3221522fd 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -5,7 +5,7 @@ import numpy as np -from .attribute_adapter import get_adapter +from .attribute_type import get_adapter from .attribute_type import AttributeType from .declare import ( CORE_TYPE_NAMES, @@ -65,7 +65,6 @@ def decode(self, stored, *, key=None): store=None, unsupported=False, attribute_expression=None, - database=None, dtype=object, ) @@ 
-282,7 +281,6 @@ def _init_from_database(self): for attr in attributes: attr.update( in_key=(attr["in_key"] == "PRI"), - database=database, nullable=attr["nullable"] == "YES", autoincrement=bool(re.search(r"auto_increment", attr["Extra"], flags=re.I)), numeric=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("DECIMAL", "INTEGER", "FLOAT")), diff --git a/tests/test_bypass_serialization.py b/tests/test_bypass_serialization.py deleted file mode 100644 index da7f0b0e3..000000000 --- a/tests/test_bypass_serialization.py +++ /dev/null @@ -1,57 +0,0 @@ -import numpy as np -import pytest -from numpy.testing import assert_array_equal - -import datajoint as dj - -test_blob = np.array([1, 2, 3]) - - -class Input(dj.Lookup): - definition = """ - id: int - --- - data: blob - """ - contents = [(0, test_blob)] - - -class Output(dj.Manual): - definition = """ - id: int - --- - data: blob - """ - - -@pytest.fixture -def schema_in(connection_test, prefix): - schema = dj.Schema( - prefix + "_test_bypass_serialization_in", - context=dict(Input=Input), - connection=connection_test, - ) - schema(Input) - yield schema - schema.drop() - - -@pytest.fixture -def schema_out(connection_test, prefix): - schema = dj.Schema( - prefix + "_test_blob_bypass_serialization_out", - context=dict(Output=Output), - connection=connection_test, - ) - schema(Output) - yield schema - schema.drop() - - -def test_bypass_serialization(schema_in, schema_out): - dj.blob.bypass_serialization = True - contents = Input.fetch(as_dict=True) - assert isinstance(contents[0]["data"], bytes) - Output.insert(contents) - dj.blob.bypass_serialization = False - assert_array_equal(Input.fetch1("data"), Output.fetch1("data")) From 3c4608fc94cc9fd33f4fcc5c8601b5a4a725c301 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 22:12:19 +0000 Subject: [PATCH 32/42] Update implementation plan to reflect actual implementation - Document function-based content storage (not registry class) - Add implementation status table - 
Explain design decision: functions vs database table - Update Phase 5 GC design for scanning approach - Document removed/deprecated items Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 595 ++++++------------ 1 file changed, 194 insertions(+), 401 deletions(-) diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index 13d2e45d3..a425837eb 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -7,187 +7,150 @@ This plan describes the implementation of a three-layer type architecture for Da 1. Establish a clean three-layer type hierarchy (native DB types, core DataJoint types, AttributeTypes) 2. Implement content-addressed storage with deduplication 3. Provide composable, user-friendly types (``, ``, ``) -4. Enable project-wide garbage collection via `ContentRegistry` +4. Enable project-wide garbage collection 5. Maintain backward compatibility with existing schemas --- -## Phase 1: Core Type System Foundation +## Implementation Status -**Goal**: Establish the complete Layer 2 core type mappings and enhance the AttributeType infrastructure. 
+| Phase | Status | Notes | +|-------|--------|-------| +| Phase 1: Core Type System | ✅ Complete | CORE_TYPES dict, type chain resolution | +| Phase 2: Content-Addressed Storage | ✅ Complete | Function-based, no registry table | +| Phase 3: User-Defined AttributeTypes | 🔲 Pending | XBlobType done, AttachType/FilepathType pending | +| Phase 4: Insert and Fetch Integration | ✅ Complete | Type chain encoding/decoding | +| Phase 5: Garbage Collection | 🔲 Pending | | +| Phase 6: Migration Utilities | 🔲 Pending | | +| Phase 7: Documentation and Testing | 🔲 Pending | | -### 1.1 Expand Core Type Mappings - -**Files to modify:** -- `src/datajoint/declare.py` - -**Current state**: `SQL_TYPE_ALIASES` already maps some types (float32, int32, etc.) - -**Changes needed**: -1. Complete the type mappings as per spec: - ``` - Core Type -> MySQL Type - int8 -> TINYINT - uint8 -> TINYINT UNSIGNED - int16 -> SMALLINT - ... - json -> JSON - uuid -> BINARY(16) or CHAR(36) - decimal -> DECIMAL(p,s) - ``` - -2. Add PostgreSQL mappings for future support (can be placeholder initially) - -**Dependencies**: None - -### 1.2 Enhance AttributeType with Store Parameter Support +--- -**Files to modify:** -- `src/datajoint/attribute_type.py` +## Phase 1: Core Type System Foundation ✅ -**Current state**: Types don't support `@store` parameter syntax +**Status**: Complete -**Changes needed**: -1. Add `store_name` property to `AttributeType` -2. Modify `resolve_dtype()` to handle `` syntax -3. 
Add `get_type_with_store(name_with_store)` helper that parses `xblob@cold` format +### Implemented in `src/datajoint/declare.py`: ```python -def parse_type_spec(spec: str) -> tuple[str, str | None]: - """Parse '' or '' into (type_name, store_name).""" - spec = spec.strip("<>") - if "@" in spec: - type_name, store_name = spec.split("@", 1) - return type_name, store_name - return spec, None +CORE_TYPES = { + # Numeric types (aliased to native SQL) + "float32": (r"float32$", "float"), + "float64": (r"float64$", "double"), + "int64": (r"int64$", "bigint"), + "uint64": (r"uint64$", "bigint unsigned"), + "int32": (r"int32$", "int"), + "uint32": (r"uint32$", "int unsigned"), + "int16": (r"int16$", "smallint"), + "uint16": (r"uint16$", "smallint unsigned"), + "int8": (r"int8$", "tinyint"), + "uint8": (r"uint8$", "tinyint unsigned"), + "bool": (r"bool$", "tinyint"), + # UUID (stored as binary) + "uuid": (r"uuid$", "binary(16)"), + # JSON + "json": (r"json$", None), + # Binary (blob maps to longblob) + "blob": (r"blob$", "longblob"), + # Temporal + "date": (r"date$", None), + "datetime": (r"datetime$", None), + # String types (with parameters) + "char": (r"char\s*\(\d+\)$", None), + "varchar": (r"varchar\s*\(\d+\)$", None), + # Enumeration + "enum": (r"enum\s*\(.+\)$", None), +} ``` -**Dependencies**: None +### Key changes: +- Removed `SERIALIZED_TYPES`, `BINARY_TYPES`, `EXTERNAL_TYPES` +- Core types are recorded in field comments with `:type:` syntax +- Non-standard native types pass through with warning +- `parse_type_spec()` handles `` syntax +- `resolve_dtype()` returns `(final_dtype, type_chain, store_name)` tuple -### 1.3 Update Heading and Declaration Parsing +--- -**Files to modify:** -- `src/datajoint/heading.py` -- `src/datajoint/declare.py` +## Phase 2: Content-Addressed Storage ✅ -**Changes needed**: -1. Update `TYPE_PATTERN` to recognize new AttributeType patterns -2. Store `store_name` in attribute metadata for parameterized types -3. 
Update `compile_attribute()` to handle `` syntax -4. Update `_init_from_database()` to reconstruct store information +**Status**: Complete (simplified design) -**Dependencies**: Phase 1.2 +### Design Decision: Functions vs Class ---- +The original plan proposed a `ContentRegistry` class with a database table. We implemented a simpler, stateless approach using functions in `content_registry.py`: -## Phase 2: Content-Addressed Storage Implementation +**Why functions instead of a registry table:** +1. **Simpler** - No additional database table to manage +2. **Decoupled** - Content storage is independent of any schema +3. **GC by scanning** - Garbage collection scans tables for references rather than maintaining reference counts +4. **Less state** - No synchronization issues between registry and actual storage -**Goal**: Implement the `` type with content-addressed storage and deduplication. +### Implemented in `src/datajoint/content_registry.py`: -### 2.1 Create ContentRegistry Table +```python +def compute_content_hash(data: bytes) -> str: + """Compute SHA256 hash of content.""" + return hashlib.sha256(data).hexdigest() -**New file to create:** -- `src/datajoint/content_registry.py` +def build_content_path(content_hash: str) -> str: + """Build path: _content/{hash[:2]}/{hash[2:4]}/{hash}""" + return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" -**Implementation**: -```python -class ContentRegistry: - """ - Project-level content registry for content-addressed storage. - Stored in a designated database (e.g., `{project}_content`). - """ - definition = """ - # Content-addressed object registry (project-wide) - content_hash : char(64) # SHA256 hex - --- - store : varchar(64) # Store name - size : bigint unsigned # Size in bytes - created : timestamp DEFAULT CURRENT_TIMESTAMP - """ -``` +def put_content(data: bytes, store_name: str | None = None) -> dict[str, Any]: + """Store content with deduplication. Returns {hash, store, size}.""" + ... 
-Key features: -- Auto-create the registry database on first use -- Methods: `insert_content()`, `get_content()`, `increment_ref()`, `decrement_ref()` -- Thread-safe reference counting (if needed) +def get_content(content_hash: str, store_name: str | None = None) -> bytes: + """Retrieve content by hash with verification.""" + ... -**Dependencies**: None +def content_exists(content_hash: str, store_name: str | None = None) -> bool: + """Check if content exists.""" + ... -### 2.2 Implement ContentType AttributeType +def delete_content(content_hash: str, store_name: str | None = None) -> bool: + """Delete content (use with caution - verify no references first).""" + ... +``` -**Files to modify:** -- `src/datajoint/attribute_type.py` +### Implemented AttributeTypes in `src/datajoint/attribute_type.py`: -**New built-in type**: ```python class ContentType(AttributeType): - """Built-in AttributeType for content-addressed storage.""" + """Content-addressed storage. Stores bytes, returns JSON metadata.""" type_name = "content" dtype = "json" - def encode(self, data: bytes, *, key=None, store_name=None) -> dict: - """Store content, return metadata as JSON.""" - content_hash = hashlib.sha256(data).hexdigest() - path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" - # Store if not exists, register in ContentRegistry - ... - return {"hash": content_hash, "store": store_name, "size": len(data)} + def encode(self, value: bytes, *, key=None, store_name=None) -> dict: + return put_content(value, store_name=store_name) def decode(self, stored: dict, *, key=None) -> bytes: - """Retrieve content by hash.""" - ... -``` - -**Dependencies**: Phase 2.1 - -### 2.3 Implement Content Storage Backend Methods - -**Files to modify:** -- `src/datajoint/storage.py` - -**Changes needed**: -1. Add `put_content()` method with deduplication -2. Add `get_content()` method with hash verification -3. Add `compute_content_hash()` utility -4. 
Add content path generation: `_content/{hash[:2]}/{hash[2:4]}/{hash}` + return get_content(stored["hash"], store_name=stored.get("store")) -**Dependencies**: None ---- - -## Phase 3: User-Defined AttributeTypes - -**Goal**: Implement the standard user-facing types that compose with `` and ``. - -### 3.1 Implement XBlobType (External Blob) - -**Files to modify:** -- `src/datajoint/attribute_type.py` - -```python -@register_type class XBlobType(AttributeType): """External serialized blob using content-addressed storage.""" type_name = "xblob" - dtype = "<content>" # Composition: uses ContentType + dtype = "<content>" # Composition - def encode(self, value, *, key=None) -> bytes: - from . import blob + def encode(self, value, *, key=None, store_name=None) -> bytes: return blob.pack(value, compress=True) - def decode(self, stored, *, key=None) -> Any: - from . import blob - return blob.unpack(stored) + def decode(self, stored: bytes, *, key=None) -> Any: + return blob.unpack(stored, squeeze=False) ``` -**Key behavior**: Serializes to djblob format, stores via content-addressed storage +--- -**Dependencies**: Phase 2.2 +## Phase 3: User-Defined AttributeTypes -### 3.2 Implement AttachType and XAttachType +**Status**: Partially complete -**Files to modify:** -- `src/datajoint/attribute_type.py` +### 3.1 XBlobType ✅ +Implemented as shown above. Composes with `<content>`. + +### 3.2 AttachType and XAttachType 🔲 ```python @register_type @@ -210,22 +173,10 @@ class XAttachType(AttributeType): """External file attachment using content-addressed storage.""" type_name = "xattach" dtype = "<content>" - - def encode(self, filepath, *, key=None) -> bytes: - path = Path(filepath) - return path.name.encode() + b"\0" + path.read_bytes() - - def decode(self, stored, *, key=None) -> str: - # Same as AttachType.decode() - ...
+ # Similar to AttachType but composes with content storage ``` -**Dependencies**: Phase 2.2 - -### 3.3 Implement FilepathType - -**Files to modify:** -- `src/datajoint/attribute_type.py` +### 3.3 FilepathType 🔲 ```python @register_type @@ -234,337 +185,179 @@ class FilepathType(AttributeType): type_name = "filepath" dtype = "json" - def encode(self, relative_path: str, *, key=None, store_name=None, - compute_checksum: bool = False) -> dict: + def encode(self, relative_path: str, *, key=None, store_name=None) -> dict: """Register reference to file in store.""" - store = get_store(store_name) # Required for filepath - metadata = {'path': relative_path, 'store': store_name} - if compute_checksum: - # Compute checksum and size - ... - return metadata + return {'path': relative_path, 'store': store_name} def decode(self, stored: dict, *, key=None) -> ObjectRef: """Return ObjectRef for lazy access.""" - return ObjectRef( - store=get_store(stored['store']), - path=stored['path'], - checksum=stored.get('checksum') - ) + return ObjectRef(store=stored['store'], path=stored['path']) ``` -**Key difference from legacy**: Returns `ObjectRef` instead of copying to local stage - -**Dependencies**: Existing `ObjectRef` and `StorageBackend` - --- -## Phase 4: Insert and Fetch Integration +## Phase 4: Insert and Fetch Integration ✅ -**Goal**: Update the data path to handle the new type system seamlessly. +**Status**: Complete -### 4.1 Update Insert Processing - -**Files to modify:** -- `src/datajoint/table.py` - -**Changes needed in `__make_placeholder()`**: -1. Handle type composition (resolve full type chain) -2. Pass `store_name` to `encode()` when applicable -3. Handle `` type's special behavior -4. 
Process `` with store parameter +### Updated in `src/datajoint/table.py`: ```python def __make_placeholder(self, name, value, ...): - attr = self.heading[name] if attr.adapter: - # Resolve type chain and pass store_name - final_dtype, type_chain = resolve_dtype(attr.adapter.dtype) - store_name = attr.store - - # Apply type chain: outer -> inner + from .attribute_type import resolve_dtype + attr.adapter.validate(value) + _, type_chain, resolved_store = resolve_dtype( + f"<{attr.adapter.type_name}>", store_name=attr.store + ) + # Apply type chain: outermost → innermost for attr_type in type_chain: - value = attr_type.encode(value, key=key, store_name=store_name) - - # Continue with final_dtype processing - ... + try: + value = attr_type.encode(value, key=None, store_name=resolved_store) + except TypeError: + value = attr_type.encode(value, key=None) ``` -**Dependencies**: Phases 1-3 - -### 4.2 Update Fetch Processing - -**Files to modify:** -- `src/datajoint/fetch.py` - -**Changes needed in `_get()`**: -1. Handle `` type: retrieve from content store -2. Handle type composition: apply decoders in reverse order -3. Handle ``: return `ObjectRef` instead of downloading +### Updated in `src/datajoint/fetch.py`: ```python def _get(connection, attr, data, squeeze, download_path): if attr.adapter: - final_dtype, type_chain = resolve_dtype(attr.adapter.dtype) + from .attribute_type import resolve_dtype + final_dtype, type_chain, _ = resolve_dtype(f"<{attr.adapter.type_name}>") - # Process based on final_dtype - if final_dtype == "json": + # Parse JSON if final storage is JSON + if final_dtype.lower() == "json": data = json.loads(data) - elif final_dtype == "longblob": - # Handle content retrieval if needed - ... 
- # Apply type chain in reverse: inner -> outer + # Apply type chain in reverse: innermost → outermost for attr_type in reversed(type_chain): - data = attr_type.decode(data, key=key) + data = attr_type.decode(data, key=None) return data ``` -**Dependencies**: Phases 1-3 - -### 4.3 Update Heading Attribute Properties - -**Files to modify:** -- `src/datajoint/heading.py` - -**Changes needed**: -1. Add `is_content` property for content-addressed attributes -2. Update property detection logic for new types -3. Store composed type information for fetch/insert - -**Dependencies**: Phase 1.3 - --- -## Phase 5: Garbage Collection +## Phase 5: Garbage Collection 🔲 -**Goal**: Implement project-wide garbage collection for content-addressed storage. +**Status**: Pending -### 5.1 Implement GC Scanner +### Design (updated for function-based approach): -**New file to create:** -- `src/datajoint/gc.py` +Since we don't have a registry table, GC works by scanning: ```python -def scan_content_references(project) -> set[tuple[str, str]]: +def scan_content_references(schemas: list) -> set[tuple[str, str]]: """ - Scan all schemas in project for content references. + Scan all schemas for content references. 
Returns: Set of (content_hash, store) tuples that are referenced """ referenced = set() - for schema in project.schemas: + for schema in schemas: for table in schema.tables: for attr in table.heading.attributes: - if attr.type in ('content', 'xblob', 'xattach'): - hashes = table.fetch(attr.name) - for h in hashes: - if isinstance(h, dict): - referenced.add((h['hash'], h.get('store'))) + if uses_content_storage(attr): + # Fetch all JSON metadata from this column + for row in table.fetch(attr.name): + if isinstance(row, dict) and 'hash' in row: + referenced.add((row['hash'], row.get('store'))) return referenced -def garbage_collect(project, dry_run=True) -> dict: +def list_stored_content(store_name: str) -> set[str]: + """List all content hashes in a store by scanning _content/ directory.""" + ... + +def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: """ Remove unreferenced content from storage. Returns: Stats: {'scanned': N, 'orphaned': M, 'deleted': K, 'bytes_freed': B} """ - ... -``` - -**Dependencies**: Phase 2.1 - -### 5.2 Add GC CLI Commands - -**Files to modify:** -- CLI or management interface + referenced = scan_content_references(schemas) + stored = list_stored_content(store_name) + orphaned = stored - {h for h, s in referenced if s == store_name} -**New commands**: -- `dj gc scan` - Scan and report orphaned content -- `dj gc clean` - Remove orphaned content -- `dj gc status` - Show content registry status - -**Dependencies**: Phase 5.1 - ---- + if not dry_run: + for content_hash in orphaned: + delete_content(content_hash, store_name) -## Phase 6: Migration Utilities - -**Goal**: Provide tools to migrate existing schemas to the new type system. - -### 6.1 Enhance Migration Module - -**Files to modify:** -- `src/datajoint/migrate.py` - -**New functions**: - -```python -def analyze_external_stores(schema) -> list[dict]: - """Analyze legacy ~external_* tables for migration.""" - ... 
- -def migrate_external_to_content(schema, store_name, dry_run=True) -> dict: - """ - Migrate legacy ~external_{store} to new ContentRegistry. - - Steps: - 1. Read entries from ~external_{store} - 2. For each entry: fetch content, compute SHA256 - 3. Copy to _content/{hash}/ if not exists - 4. Update referencing tables (UUID -> hash JSON) - 5. Register in ContentRegistry - """ - ... - -def migrate_blob_to_djblob(schema, dry_run=True) -> dict: - """Update implicit blob columns to use .""" - ... - -def migrate_filepath_to_new(schema, dry_run=True) -> dict: - """ - Migrate legacy filepath@store to new . - - Changes: - - UUID column -> JSON column - - Copy-based access -> ObjectRef-based access - """ - ... + return {'orphaned': len(orphaned), ...} ``` -### 6.2 Create Migration CLI - -**New commands**: -- `dj migrate analyze ` - Analyze migration needs -- `dj migrate external ` - Migrate external store -- `dj migrate blobs ` - Migrate blob columns -- `dj migrate status ` - Show migration status - -**Dependencies**: Phase 6.1 - --- -## Phase 7: Documentation and Testing +## Phase 6: Migration Utilities 🔲 -### 7.1 Unit Tests +**Status**: Pending -**New test files:** -- `tests/test_content_type.py` - Content-addressed storage tests -- `tests/test_xblob.py` - XBlob type tests -- `tests/test_attach_types.py` - Attachment type tests -- `tests/test_filepath_new.py` - New filepath tests -- `tests/test_gc.py` - Garbage collection tests -- `tests/test_migration.py` - Migration utility tests +### Key migrations needed: +1. Legacy `~external_{store}` tables → content-addressed storage +2. UUID-based external references → hash-based JSON metadata +3. Legacy `filepath@store` → new `` with ObjectRef -**Existing test files to update:** -- `tests/test_attribute_type.py` - Add new type tests -- `tests/test_object.py` - Verify object type unchanged - -### 7.2 Integration Tests +--- -**Test scenarios**: -1. Insert/fetch roundtrip for all new types -2. 
Type composition (xblob using content) -3. Multi-schema content deduplication -4. GC with cross-schema references -5. Migration from legacy external stores -6. Backward compatibility with existing schemas +## Phase 7: Documentation and Testing 🔲 -### 7.3 Documentation +**Status**: Pending -**Files to update:** -- `docs/src/design/tables/storage-types-spec.md` - Already exists -- Create user guide for new types -- Create migration guide -- Update API reference +### Test files to create: +- `tests/test_content_storage.py` - Content-addressed storage functions +- `tests/test_xblob.py` - XBlobType roundtrip +- `tests/test_type_composition.py` - Type chain encoding/decoding +- `tests/test_gc.py` - Garbage collection --- -## Implementation Order and Dependencies +## Critical Files Summary -``` -Phase 1: Core Type System Foundation -├── 1.1 Expand Core Type Mappings (no deps) -├── 1.2 Enhance AttributeType with Store Parameter (no deps) -└── 1.3 Update Heading and Declaration Parsing (depends on 1.2) - -Phase 2: Content-Addressed Storage -├── 2.1 Create ContentRegistry Table (no deps) -├── 2.2 Implement ContentType (depends on 2.1) -└── 2.3 Content Storage Backend Methods (no deps) - -Phase 3: User-Defined AttributeTypes (depends on Phase 2) -├── 3.1 Implement XBlobType (depends on 2.2) -├── 3.2 Implement AttachType and XAttachType (depends on 2.2) -└── 3.3 Implement FilepathType (no deps) - -Phase 4: Insert and Fetch Integration (depends on Phases 1-3) -├── 4.1 Update Insert Processing -├── 4.2 Update Fetch Processing -└── 4.3 Update Heading Attribute Properties - -Phase 5: Garbage Collection (depends on Phase 2) -├── 5.1 Implement GC Scanner -└── 5.2 Add GC CLI Commands - -Phase 6: Migration Utilities (depends on Phases 2-4) -├── 6.1 Enhance Migration Module -└── 6.2 Create Migration CLI - -Phase 7: Documentation and Testing (ongoing) -``` +| File | Status | Changes | +|------|--------|---------| +| `src/datajoint/declare.py` | ✅ | CORE_TYPES, type parsing, SQL 
generation | +| `src/datajoint/heading.py` | ✅ | Simplified attribute properties | +| `src/datajoint/attribute_type.py` | ✅ | ContentType, XBlobType, type chain resolution | +| `src/datajoint/content_registry.py` | ✅ | Content storage functions (put, get, delete) | +| `src/datajoint/table.py` | ✅ | Type chain encoding on insert | +| `src/datajoint/fetch.py` | ✅ | Type chain decoding on fetch | +| `src/datajoint/blob.py` | ✅ | Removed bypass_serialization | +| `src/datajoint/gc.py` | 🔲 | Garbage collection (to be created) | +| `src/datajoint/migrate.py` | 🔲 | Migration utilities | --- -## Critical Files Summary +## Removed/Deprecated -| File | Changes | -|------|---------| -| `src/datajoint/attribute_type.py` | All new AttributeTypes: `ContentType`, `XBlobType`, `AttachType`, `XAttachType`, `FilepathType` | -| `src/datajoint/declare.py` | Type pattern parsing, SQL generation, `` syntax | -| `src/datajoint/heading.py` | Attribute metadata, composed type information | -| `src/datajoint/table.py` | Insert logic with type composition | -| `src/datajoint/fetch.py` | Fetch logic with type chain decoding | -| `src/datajoint/content_registry.py` | **New**: ContentRegistry table and methods | -| `src/datajoint/gc.py` | **New**: Garbage collection scanner | -| `src/datajoint/migrate.py` | Migration utilities | +- `src/datajoint/attribute_adapter.py` - Deleted (hard deprecated) +- `bypass_serialization` flag in `blob.py` - Removed +- `database` field in Attribute - Removed (unused) +- `SERIALIZED_TYPES`, `BINARY_TYPES`, `EXTERNAL_TYPES` - Removed +- `is_attachment`, `is_filepath`, `is_object`, `is_external` flags - Removed --- -## Risk Mitigation +## Architecture Summary -### Backward Compatibility -1. All existing types (`longblob`, `blob@store`, `attach@store`, `filepath@store`) continue to work -2. Legacy `~external_*` tables remain functional during transition -3. Implicit blob serialization preserved for existing schemas -4. 
Migration is opt-in and reversible +``` +Layer 3: AttributeTypes (user-facing) + , , , , + ↓ encode() / ↑ decode() -### Performance Considerations -1. Content hashing uses SHA256 (fast, widely supported) -2. Deduplication reduces storage costs -3. Lazy ObjectRef prevents unnecessary I/O -4. GC runs on-demand, not automatically +Layer 2: Core DataJoint Types + float32, int64, uuid, json, blob, varchar(n), etc. + ↓ SQL mapping -### Error Handling -1. Content hash verification on fetch (optional) -2. Graceful handling of missing content -3. Transaction safety for multi-table operations -4. Clear error messages for misconfiguration +Layer 1: Native Database Types + FLOAT, BIGINT, BINARY(16), JSON, LONGBLOB, VARCHAR(n), etc. +``` ---- +**Type Composition Example:** +``` + → json (in DB) -## Estimated Effort - -| Phase | Estimated Days | Risk | -|-------|----------------|------| -| Phase 1 | 3-4 days | Low | -| Phase 2 | 4-5 days | Medium | -| Phase 3 | 3-4 days | Low | -| Phase 4 | 4-5 days | Medium | -| Phase 5 | 2-3 days | Low | -| Phase 6 | 3-4 days | Medium | -| Phase 7 | 5-7 days | Low | -| **Total** | **24-32 days** | | +Insert: Python object → blob.pack() → put_content() → JSON metadata +Fetch: JSON metadata → get_content() → blob.unpack() → Python object +``` From 70fb567afdb59fc013e67c2f0cc09323ffec1da2 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 22:19:51 +0000 Subject: [PATCH 33/42] Move built-in AttributeTypes to separate builtin_types.py module - Create builtin_types.py with DJBlobType, ContentType, XBlobType - Types serve as examples for users creating custom types - Module docstring includes example of defining a custom GraphType - Add get_adapter() function to attribute_type.py for compatibility - Auto-register built-in types via import at module load Co-authored-by: dimitri-yatsenko --- src/datajoint/attribute_type.py | 297 +++----------------------------- src/datajoint/builtin_types.py | 239 +++++++++++++++++++++++++ 2 files changed, 
260 insertions(+), 276 deletions(-) create mode 100644 src/datajoint/builtin_types.py diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py index 2c06ccc83..37fae88ca 100644 --- a/src/datajoint/attribute_type.py +++ b/src/datajoint/attribute_type.py @@ -463,290 +463,35 @@ def resolve_dtype( return dtype, chain, store_name -# ============================================================================= -# Built-in Attribute Types -# ============================================================================= - - -class DJBlobType(AttributeType): - """ - Built-in type for DataJoint's native serialization format. - - This type handles serialization of arbitrary Python objects (including NumPy arrays, - dictionaries, lists, etc.) using DataJoint's binary blob format. The format includes: - - - Protocol headers (``mYm`` for MATLAB-compatible, ``dj0`` for Python-native) - - Optional compression (zlib) - - Support for NumPy arrays, datetime objects, UUIDs, and nested structures - - The ```` type is the explicit way to specify DataJoint's serialization. - It stores data in a MySQL ``LONGBLOB`` column. - - Example: - @schema - class ProcessedData(dj.Manual): - definition = ''' - data_id : int - --- - results : # Serialized Python objects - raw_bytes : longblob # Raw bytes (no serialization) - ''' - - Note: - Plain ``longblob`` columns store and return raw bytes without serialization. - Use ```` when you need automatic serialization of Python objects. - Existing schemas using implicit blob serialization should migrate to ```` - using ``dj.migrate.migrate_blob_columns()``. - """ - - type_name = "djblob" - dtype = "longblob" - - def encode(self, value: Any, *, key: dict | None = None) -> bytes: - """ - Serialize a Python object to DataJoint's blob format. - - Args: - value: Any serializable Python object (dict, list, numpy array, etc.) - key: Primary key values (unused for blob serialization). 
- - Returns: - Serialized bytes with protocol header and optional compression. - """ - from . import blob - - return blob.pack(value, compress=True) - - def decode(self, stored: bytes, *, key: dict | None = None) -> Any: - """ - Deserialize DataJoint blob format back to a Python object. - - Args: - stored: Serialized blob bytes. - key: Primary key values (unused for blob serialization). - - Returns: - The deserialized Python object. - """ - from . import blob - - return blob.unpack(stored, squeeze=False) - - -class DJBlobExternalType(AttributeType): - """ - Built-in type for externally-stored DataJoint blobs. - - Similar to ```` but stores data in external blob storage instead - of inline in the database. Useful for large objects. - - The store name is specified when defining the column type. - - Example: - @schema - class LargeData(dj.Manual): - definition = ''' - data_id : int - --- - large_array : blob@mystore # External storage with auto-serialization - ''' +def get_adapter(context: dict | None, adapter_name: str) -> tuple[AttributeType, str | None]: """ + Get an attribute type by name. - # Note: This type isn't directly usable via syntax - # It's used internally when blob@store syntax is detected - type_name = "djblob_external" - dtype = "blob@store" # Placeholder - actual store is determined at declaration time - - def encode(self, value: Any, *, key: dict | None = None) -> bytes: - """Serialize a Python object to DataJoint's blob format.""" - from . import blob - - return blob.pack(value, compress=True) - - def decode(self, stored: bytes, *, key: dict | None = None) -> Any: - """Deserialize DataJoint blob format back to a Python object.""" - from . import blob - - return blob.unpack(stored, squeeze=False) - - -class ContentType(AttributeType): - """ - Built-in type for content-addressed storage with deduplication. - - The ```` type stores data using content-addressed storage. 
Data is - identified by its SHA256 hash and stored in a hierarchical directory structure. - Duplicate content is automatically deduplicated - storing the same bytes twice - will only create one copy in storage. - - The database column stores JSON metadata including the content hash, store name, - and size. The actual content is stored in external storage. - - This type is primarily used as a building block for other types like ```` - and ````, but can also be used directly for raw binary content. - - Example: - @schema - class RawContent(dj.Manual): - definition = ''' - content_id : int - --- - data : # Content-addressed storage - ''' - - # Insert raw bytes - table.insert1({'content_id': 1, 'data': b'raw binary content'}) - - # Fetch returns the original bytes - data = (table & 'content_id=1').fetch1('data') - assert data == b'raw binary content' - - Storage Structure: - Content is stored at: ``_content/{hash[:2]}/{hash[2:4]}/{hash}`` - This hierarchical structure prevents too many files in a single directory. - - Note: - The store parameter is required for ```` unless a default store - is configured. Use ```` syntax to specify the store. - """ + This is a compatibility function used by heading and declare modules. - type_name = "content" - dtype = "json" - - def encode(self, value: bytes, *, key: dict | None = None, store_name: str | None = None) -> dict: - """ - Store content and return metadata. - - Computes the SHA256 hash of the content and stores it using content-addressed - storage. If content with the same hash already exists, it is not re-uploaded - (deduplication). - - Args: - value: Raw bytes to store. - key: Primary key values (unused for content storage). - store_name: Store to use. If None, uses default store from config. - - Returns: - Metadata dict with keys: hash, store, size - - Raises: - TypeError: If value is not bytes. 
- """ - if not isinstance(value, bytes): - raise TypeError(f" type expects bytes, got {type(value).__name__}") - - from .content_registry import put_content - - return put_content(value, store_name=store_name) - - def decode(self, stored: dict, *, key: dict | None = None) -> bytes: - """ - Retrieve content by its hash. - - Args: - stored: Metadata dict with 'hash' and optionally 'store' keys. - key: Primary key values (unused for content retrieval). - - Returns: - The original bytes. - - Raises: - MissingExternalFile: If content is not found. - DataJointError: If hash verification fails. - """ - from .content_registry import get_content - - content_hash = stored["hash"] - store_name = stored.get("store") - return get_content(content_hash, store_name=store_name) - - def validate(self, value: Any) -> None: - """Validate that value is bytes.""" - if not isinstance(value, bytes): - raise TypeError(f" type expects bytes, got {type(value).__name__}") - - -class XBlobType(AttributeType): - """ - Built-in type for externally-stored serialized blobs with deduplication. - - The ```` type combines DataJoint's blob serialization with content-addressed - storage. Objects are serialized using the djblob format, then stored externally - using content-addressed storage for automatic deduplication. - - This type is ideal for large objects (NumPy arrays, pandas DataFrames, etc.) - that may be duplicated across multiple rows. - - Example: - @schema - class LargeArrays(dj.Manual): - definition = ''' - array_id : int - --- - data : # External serialized blob with deduplication - ''' - - # Insert NumPy array - import numpy as np - table.insert1({'array_id': 1, 'data': np.random.rand(1000, 1000)}) + Args: + context: Ignored (legacy parameter, kept for API compatibility). + adapter_name: The type name, with or without angle brackets. + May include store parameter (e.g., ""). 
- # Fetch returns the original array - data = (table & 'array_id=1').fetch1('data') + Returns: + Tuple of (AttributeType instance, store_name or None). - Note: - - For internal storage (in database), use ```` - - For external storage without serialization, use ```` - - The store parameter is required unless a default store is configured + Raises: + DataJointError: If the type is not found. """ + type_name, store_name = parse_type_spec(adapter_name) - type_name = "xblob" - dtype = "" # Composition: uses ContentType for storage - - def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: - """ - Serialize a Python object to bytes. - - The object is serialized using DataJoint's blob format. The resulting - bytes are then passed to the underlying ```` type for storage. - - Args: - value: Any serializable Python object. - key: Primary key values (unused). - store_name: Store parameter (passed through to content storage). - - Returns: - Serialized bytes (will be stored by ContentType). - """ - from . import blob - - return blob.pack(value, compress=True) - - def decode(self, stored: bytes, *, key: dict | None = None) -> Any: - """ - Deserialize bytes back to a Python object. - - Args: - stored: Serialized bytes retrieved from content storage. - key: Primary key values (unused). + if is_type_registered(type_name): + return get_type(type_name), store_name - Returns: - The deserialized Python object. - """ - from . import blob + raise DataJointError(f"Attribute type <{type_name}> is not registered. " "Use @dj.register_type to register custom types.") - return blob.unpack(stored, squeeze=False) - - -def _register_builtin_types() -> None: - """ - Register DataJoint's built-in attribute types. - - Called automatically during module initialization. 
- """ - register_type(DJBlobType) - register_type(ContentType) - register_type(XBlobType) +# ============================================================================= +# Auto-register built-in types +# ============================================================================= -# Register built-in types when module is loaded -_register_builtin_types() +# Import builtin_types module to register built-in types (DJBlobType, ContentType, etc.) +# This import has a side effect: it registers the types via @register_type decorators +from . import builtin_types as _builtin_types # noqa: F401, E402 diff --git a/src/datajoint/builtin_types.py b/src/datajoint/builtin_types.py new file mode 100644 index 000000000..303b84945 --- /dev/null +++ b/src/datajoint/builtin_types.py @@ -0,0 +1,239 @@ +""" +Built-in DataJoint attribute types. + +This module defines the standard AttributeTypes that ship with DataJoint. +These serve as both useful built-in types and as examples for users who +want to create their own custom types. 
+ +Built-in Types: + - ````: Serialize Python objects to DataJoint's blob format (internal storage) + - ````: Content-addressed storage with SHA256 deduplication + - ````: External serialized blobs using content-addressed storage + +Example - Creating a Custom Type: + Here's how to define your own AttributeType, modeled after the built-in types:: + + import datajoint as dj + import networkx as nx + + @dj.register_type + class GraphType(dj.AttributeType): + '''Store NetworkX graphs as edge lists.''' + + type_name = "graph" # Use as in definitions + dtype = "" # Compose with djblob for serialization + + def encode(self, graph, *, key=None, store_name=None): + # Convert graph to a serializable format + return { + 'nodes': list(graph.nodes(data=True)), + 'edges': list(graph.edges(data=True)), + } + + def decode(self, stored, *, key=None): + # Reconstruct graph from stored format + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G + + def validate(self, value): + if not isinstance(value, nx.Graph): + raise TypeError(f"Expected nx.Graph, got {type(value).__name__}") + + # Now use in table definitions: + @schema + class Networks(dj.Manual): + definition = ''' + network_id : int + --- + topology : + ''' +""" + +from __future__ import annotations + +from typing import Any + +from .attribute_type import AttributeType, register_type + + +# ============================================================================= +# DJBlob Types - DataJoint's native serialization +# ============================================================================= + + +@register_type +class DJBlobType(AttributeType): + """ + Serialize Python objects using DataJoint's blob format. + + The ```` type handles serialization of arbitrary Python objects + including NumPy arrays, dictionaries, lists, datetime objects, and UUIDs. + Data is stored in a MySQL ``LONGBLOB`` column. 
+ + Format Features: + - Protocol headers (``mYm`` for MATLAB-compatible, ``dj0`` for Python-native) + - Optional zlib compression for data > 1KB + - Support for nested structures + + Example:: + + @schema + class ProcessedData(dj.Manual): + definition = ''' + data_id : int + --- + results : # Serialized Python objects + ''' + + # Insert any serializable object + table.insert1({'data_id': 1, 'results': {'scores': [0.9, 0.8], 'labels': ['a', 'b']}}) + + Note: + Plain ``longblob`` columns store raw bytes without serialization. + Use ```` when you need automatic serialization. + """ + + type_name = "djblob" + dtype = "longblob" + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: + """Serialize a Python object to DataJoint's blob format.""" + from . import blob + + return blob.pack(value, compress=True) + + def decode(self, stored: bytes, *, key: dict | None = None) -> Any: + """Deserialize blob bytes back to a Python object.""" + from . import blob + + return blob.unpack(stored, squeeze=False) + + +# ============================================================================= +# Content-Addressed Storage Types +# ============================================================================= + + +@register_type +class ContentType(AttributeType): + """ + Content-addressed storage with SHA256 deduplication. + + The ```` type stores raw bytes using content-addressed storage. + Data is identified by its SHA256 hash and stored in a hierarchical directory: + ``_content/{hash[:2]}/{hash[2:4]}/{hash}`` + + The database column stores JSON metadata: ``{hash, store, size}``. + Duplicate content is automatically deduplicated. + + Example:: + + @schema + class RawContent(dj.Manual): + definition = ''' + content_id : int + --- + data : + ''' + + # Insert raw bytes + table.insert1({'content_id': 1, 'data': b'raw binary content'}) + + Note: + This type accepts only ``bytes``. For Python objects, use ````. 
+ A store must be specified (e.g., ````) unless a default + store is configured. + """ + + type_name = "content" + dtype = "json" + + def encode(self, value: bytes, *, key: dict | None = None, store_name: str | None = None) -> dict: + """ + Store content and return metadata. + + Args: + value: Raw bytes to store. + key: Primary key values (unused). + store_name: Store to use. If None, uses default store. + + Returns: + Metadata dict: {hash, store, size} + """ + from .content_registry import put_content + + return put_content(value, store_name=store_name) + + def decode(self, stored: dict, *, key: dict | None = None) -> bytes: + """ + Retrieve content by hash. + + Args: + stored: Metadata dict with 'hash' and optionally 'store'. + key: Primary key values (unused). + + Returns: + Original bytes. + """ + from .content_registry import get_content + + return get_content(stored["hash"], store_name=stored.get("store")) + + def validate(self, value: Any) -> None: + """Validate that value is bytes.""" + if not isinstance(value, bytes): + raise TypeError(f" expects bytes, got {type(value).__name__}") + + +@register_type +class XBlobType(AttributeType): + """ + External serialized blobs with content-addressed storage. + + The ```` type combines DataJoint's blob serialization with + content-addressed storage. Objects are serialized, then stored externally + with automatic deduplication. + + This is ideal for large objects (NumPy arrays, DataFrames) that may be + duplicated across rows. 
+ + Example:: + + @schema + class LargeArrays(dj.Manual): + definition = ''' + array_id : int + --- + data : + ''' + + import numpy as np + table.insert1({'array_id': 1, 'data': np.random.rand(1000, 1000)}) + + Type Composition: + ```` composes with ````:: + + Insert: object → blob.pack() → put_content() → JSON metadata + Fetch: JSON → get_content() → blob.unpack() → object + + Note: + - For internal storage, use ```` + - For raw bytes without serialization, use ```` + """ + + type_name = "xblob" + dtype = "" # Composition: uses ContentType + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: + """Serialize object to bytes (passed to ContentType).""" + from . import blob + + return blob.pack(value, compress=True) + + def decode(self, stored: bytes, *, key: dict | None = None) -> Any: + """Deserialize bytes back to Python object.""" + from . import blob + + return blob.unpack(stored, squeeze=False) From ad09877dbf149a83aeff9403a2974bc8172cacd7 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 22:25:09 +0000 Subject: [PATCH 34/42] Implement ObjectType for path-addressed storage Add type for files and folders (Zarr, HDF5, etc.): - Path derived from primary key: {schema}/{table}/objects/{pk}/{field}_{token} - Supports bytes, files, and directories - Returns ObjectRef for lazy fsspec-based access - No deduplication (unlike ) Update implementation plan with Phase 2b documenting ObjectType. 
Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 70 ++++++- src/datajoint/builtin_types.py | 190 ++++++++++++++++++ 2 files changed, 257 insertions(+), 3 deletions(-) diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index a425837eb..22845c4ca 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -18,7 +18,8 @@ This plan describes the implementation of a three-layer type architecture for Da |-------|--------|-------| | Phase 1: Core Type System | ✅ Complete | CORE_TYPES dict, type chain resolution | | Phase 2: Content-Addressed Storage | ✅ Complete | Function-based, no registry table | -| Phase 3: User-Defined AttributeTypes | 🔲 Pending | XBlobType done, AttachType/FilepathType pending | +| Phase 2b: Path-Addressed Storage | ✅ Complete | ObjectType for files/folders | +| Phase 3: User-Defined AttributeTypes | 🔲 Pending | AttachType/FilepathType pending | | Phase 4: Insert and Fetch Integration | ✅ Complete | Type chain encoding/decoding | | Phase 5: Garbage Collection | 🔲 Pending | | | Phase 6: Migration Utilities | 🔲 Pending | | @@ -143,6 +144,58 @@ class XBlobType(AttributeType): --- +## Phase 2b: Path-Addressed Storage (ObjectType) ✅ + +**Status**: Complete + +### Design: Path vs Content Addressing + +| Aspect | `` | `` | +|--------|-------------|------------| +| Addressing | Content-hash (SHA256) | Path (from primary key) | +| Path Format | `_content/{hash[:2]}/{hash[2:4]}/{hash}` | `{schema}/{table}/objects/{pk}/{field}_{token}.ext` | +| Deduplication | Yes (same content = same hash) | No (each row has unique path) | +| Deletion | GC when unreferenced | Deleted with row | +| Use case | Serialized blobs, attachments | Zarr, HDF5, folders | + +### Implemented in `src/datajoint/builtin_types.py`: + +```python +@register_type +class ObjectType(AttributeType): + 
"""Path-addressed storage for files and folders.""" + type_name = "object" + dtype = "json" + + def encode(self, value, *, key=None, store_name=None) -> dict: + # value can be bytes, str path, or Path + # key contains _schema, _table, _field for path construction + path, token = build_object_path(schema, table, field, primary_key, ext) + backend.put_buffer(content, path) # or put_folder for directories + return { + "path": path, + "store": store_name, + "size": size, + "ext": ext, + "is_dir": is_dir, + "timestamp": timestamp.isoformat(), + } + + def decode(self, stored: dict, *, key=None) -> ObjectRef: + # Returns lazy handle for fsspec-based access + return ObjectRef.from_json(stored, backend=backend) +``` + +### ObjectRef Features: +- `ref.path` - Storage path +- `ref.read()` - Read file content +- `ref.open()` - Open as file handle +- `ref.fsmap` - For `zarr.open(ref.fsmap)` +- `ref.download(dest)` - Download to local path +- `ref.listdir()` / `ref.walk()` - For directories + +--- + ## Phase 3: User-Defined AttributeTypes **Status**: Partially complete @@ -319,8 +372,11 @@ def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: |------|--------|---------| | `src/datajoint/declare.py` | ✅ | CORE_TYPES, type parsing, SQL generation | | `src/datajoint/heading.py` | ✅ | Simplified attribute properties | -| `src/datajoint/attribute_type.py` | ✅ | ContentType, XBlobType, type chain resolution | +| `src/datajoint/attribute_type.py` | ✅ | Base class, registry, type chain resolution | +| `src/datajoint/builtin_types.py` | ✅ | DJBlobType, ContentType, XBlobType, ObjectType | | `src/datajoint/content_registry.py` | ✅ | Content storage functions (put, get, delete) | +| `src/datajoint/objectref.py` | ✅ | ObjectRef handle for lazy access | +| `src/datajoint/storage.py` | ✅ | StorageBackend, build_object_path | | `src/datajoint/table.py` | ✅ | Type chain encoding on insert | | `src/datajoint/fetch.py` | ✅ | Type chain decoding on fetch | | 
`src/datajoint/blob.py` | ✅ | Removed bypass_serialization | @@ -343,7 +399,7 @@ def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: ``` Layer 3: AttributeTypes (user-facing) - , , , , + , , , , , , ↓ encode() / ↑ decode() Layer 2: Core DataJoint Types @@ -354,6 +410,14 @@ Layer 1: Native Database Types FLOAT, BIGINT, BINARY(16), JSON, LONGBLOB, VARCHAR(n), etc. ``` +**Built-in AttributeTypes:** +``` + → longblob (internal serialized storage) + → json (path-addressed, for Zarr/HDF5/folders) + → json (content-addressed with deduplication) + → json (external serialized with dedup) +``` + **Type Composition Example:** ``` → json (in DB) diff --git a/src/datajoint/builtin_types.py b/src/datajoint/builtin_types.py index 303b84945..27d5d872f 100644 --- a/src/datajoint/builtin_types.py +++ b/src/datajoint/builtin_types.py @@ -9,6 +9,7 @@ - ````: Serialize Python objects to DataJoint's blob format (internal storage) - ````: Content-addressed storage with SHA256 deduplication - ````: External serialized blobs using content-addressed storage + - ````: Path-addressed storage for files/folders (Zarr, HDF5) Example - Creating a Custom Type: Here's how to define your own AttributeType, modeled after the built-in types:: @@ -237,3 +238,192 @@ def decode(self, stored: bytes, *, key: dict | None = None) -> Any: from . import blob return blob.unpack(stored, squeeze=False) + + +# ============================================================================= +# Path-Addressed Storage Types (OAS - Object-Augmented Schema) +# ============================================================================= + + +@register_type +class ObjectType(AttributeType): + """ + Path-addressed storage for files and folders. 
+ + The ```` type provides managed file/folder storage where the path + is derived from the primary key: ``{schema}/{table}/objects/{pk}/{field}_{token}.{ext}`` + + Unlike ```` (content-addressed), each row has its own storage path, + and content is deleted when the row is deleted. This is ideal for: + + - Zarr arrays (hierarchical chunked data) + - HDF5 files + - Complex multi-file outputs + - Any content that shouldn't be deduplicated + + Example:: + + @schema + class Analysis(dj.Computed): + definition = ''' + -> Recording + --- + results : + ''' + + def make(self, key): + # Store a file + self.insert1({**key, 'results': '/path/to/results.zarr'}) + + # Fetch returns ObjectRef for lazy access + ref = (Analysis & key).fetch1('results') + ref.path # Storage path + ref.read() # Read file content + ref.fsmap # For zarr.open(ref.fsmap) + + Storage Structure: + Objects are stored at:: + + {store_root}/{schema}/{table}/objects/{pk}/{field}_{token}.ext + + The token ensures uniqueness even if content is replaced. + + Comparison with ````:: + + | Aspect | | | + |----------------|-------------------|---------------------| + | Addressing | Path (by PK) | Hash (by content) | + | Deduplication | No | Yes | + | Deletion | With row | GC when unreferenced| + | Use case | Zarr, HDF5 | Blobs, attachments | + + Note: + A store must be specified (````) unless a default store + is configured. Returns ``ObjectRef`` on fetch for lazy access. + """ + + type_name = "object" + dtype = "json" + + def encode( + self, + value: Any, + *, + key: dict | None = None, + store_name: str | None = None, + ) -> dict: + """ + Store content and return metadata. + + Args: + value: Content to store. Can be: + - bytes: Raw bytes to store as file + - str/Path: Path to local file or folder to upload + key: Dict containing context for path construction: + - _schema: Schema name + - _table: Table name + - _field: Field/attribute name + - Other entries are primary key values + store_name: Store to use. 
If None, uses default store. + + Returns: + Metadata dict suitable for ObjectRef.from_json() + """ + from datetime import datetime, timezone + from pathlib import Path + + from .content_registry import get_store_backend + from .storage import build_object_path + + # Extract context from key + key = key or {} + schema = key.pop("_schema", "unknown") + table = key.pop("_table", "unknown") + field = key.pop("_field", "data") + primary_key = {k: v for k, v in key.items() if not k.startswith("_")} + + # Determine content type and extension + is_dir = False + ext = None + size = None + + if isinstance(value, bytes): + content = value + size = len(content) + elif isinstance(value, (str, Path)): + source_path = Path(value) + if not source_path.exists(): + raise FileNotFoundError(f"Source path does not exist: {source_path}") + is_dir = source_path.is_dir() + ext = source_path.suffix if not is_dir else None + if is_dir: + # For directories, we'll upload later + content = None + else: + content = source_path.read_bytes() + size = len(content) + else: + raise TypeError(f" expects bytes or path, got {type(value).__name__}") + + # Build storage path + path, token = build_object_path( + schema=schema, + table=table, + field=field, + primary_key=primary_key, + ext=ext, + ) + + # Get storage backend + backend = get_store_backend(store_name) + + # Upload content + if is_dir: + # Upload directory recursively + source_path = Path(value) + backend.put_folder(str(source_path), path) + # Compute size by summing all files + size = sum(f.stat().st_size for f in source_path.rglob("*") if f.is_file()) + else: + backend.put_buffer(content, path) + + # Build metadata + timestamp = datetime.now(timezone.utc) + metadata = { + "path": path, + "store": store_name, + "size": size, + "ext": ext, + "is_dir": is_dir, + "timestamp": timestamp.isoformat(), + } + + return metadata + + def decode(self, stored: dict, *, key: dict | None = None) -> Any: + """ + Create ObjectRef handle for lazy access. 
+ + Args: + stored: Metadata dict from database. + key: Primary key values (unused). + + Returns: + ObjectRef for accessing the stored content. + """ + from .content_registry import get_store_backend + from .objectref import ObjectRef + + store_name = stored.get("store") + backend = get_store_backend(store_name) + return ObjectRef.from_json(stored, backend=backend) + + def validate(self, value: Any) -> None: + """Validate that value is bytes or a valid path.""" + from pathlib import Path + + if isinstance(value, bytes): + return + if isinstance(value, (str, Path)): + return + raise TypeError(f" expects bytes or path, got {type(value).__name__}") From dd8c623ae15561b8483f1417f72c41ccbe64dc64 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 22:25:57 +0000 Subject: [PATCH 35/42] Remove migration phase from implementation plan Migration utilities are out of scope for now. This is a breaking change version - users will need to recreate tables with new types. Co-authored-by: dimitri-yatsenko --- .../tables/storage-types-implementation-plan.md | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index 22845c4ca..65567d03d 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -22,8 +22,7 @@ This plan describes the implementation of a three-layer type architecture for Da | Phase 3: User-Defined AttributeTypes | 🔲 Pending | AttachType/FilepathType pending | | Phase 4: Insert and Fetch Integration | ✅ Complete | Type chain encoding/decoding | | Phase 5: Garbage Collection | 🔲 Pending | | -| Phase 6: Migration Utilities | 🔲 Pending | | -| Phase 7: Documentation and Testing | 🔲 Pending | | +| Phase 6: Documentation and Testing | 🔲 Pending | | --- @@ -343,18 +342,7 @@ def garbage_collect(schemas: list, store_name: str, 
dry_run=True) -> dict: --- -## Phase 6: Migration Utilities 🔲 - -**Status**: Pending - -### Key migrations needed: -1. Legacy `~external_{store}` tables → content-addressed storage -2. UUID-based external references → hash-based JSON metadata -3. Legacy `filepath@store` → new `` with ObjectRef - ---- - -## Phase 7: Documentation and Testing 🔲 +## Phase 6: Documentation and Testing 🔲 **Status**: Pending @@ -381,7 +369,6 @@ def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: | `src/datajoint/fetch.py` | ✅ | Type chain decoding on fetch | | `src/datajoint/blob.py` | ✅ | Removed bypass_serialization | | `src/datajoint/gc.py` | 🔲 | Garbage collection (to be created) | -| `src/datajoint/migrate.py` | 🔲 | Migration utilities | --- From e1b3be11b92347d78fb274ab464ffdc895368d18 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 22:43:55 +0000 Subject: [PATCH 36/42] Add staged insert documentation to implementation plan - Document staged_insert.py for direct object storage writes - Add flow comparison: normal insert vs staged insert - Include staged_insert.py in critical files summary Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index 65567d03d..cbdfb860e 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -193,6 +193,38 @@ class ObjectType(AttributeType): - `ref.download(dest)` - Download to local path - `ref.listdir()` / `ref.walk()` - For directories +### Staged Insert for Object Types + +For large objects like Zarr arrays, `staged_insert.py` provides direct writes to storage: + +```python +with table.staged_insert1 as staged: + # 1. 
Set primary key first (required for path construction) + staged.rec['subject_id'] = 123 + staged.rec['session_id'] = 45 + + # 2. Get storage handle and write directly + z = zarr.open(staged.store('raw_data', '.zarr'), mode='w') + z[:] = large_array + + # 3. On exit: metadata computed, record inserted +``` + +**Flow comparison:** + +| Normal Insert | Staged Insert | +|--------------|---------------| +| `ObjectType.encode()` uploads content | Direct writes via `staged.store()` | +| Single operation | Two-phase: write then finalize | +| Good for files/folders | Ideal for Zarr, HDF5, streaming | + +Both produce the same JSON metadata format compatible with `ObjectRef.from_json()`. + +**Key methods:** +- `staged.store(field, ext)` - Returns `FSMap` for Zarr/xarray +- `staged.open(field, ext)` - Returns file handle for binary writes +- `staged.fs` - Raw fsspec filesystem access + --- ## Phase 3: User-Defined AttributeTypes @@ -365,6 +397,7 @@ def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: | `src/datajoint/content_registry.py` | ✅ | Content storage functions (put, get, delete) | | `src/datajoint/objectref.py` | ✅ | ObjectRef handle for lazy access | | `src/datajoint/storage.py` | ✅ | StorageBackend, build_object_path | +| `src/datajoint/staged_insert.py` | ✅ | Staged insert for direct object storage writes | | `src/datajoint/table.py` | ✅ | Type chain encoding on insert | | `src/datajoint/fetch.py` | ✅ | Type chain decoding on fetch | | `src/datajoint/blob.py` | ✅ | Removed bypass_serialization | From ca0b9149fe831f9317204ee75f3d68bef51e6bef Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Dec 2025 22:56:33 +0000 Subject: [PATCH 37/42] Implement Phase 3: AttachType, XAttachType, FilepathType Add remaining built-in AttributeTypes: - : Internal file attachment stored in longblob - : External file attachment via with deduplication - : Reference to existing file (no copy, returns ObjectRef) Update implementation plan to mark Phase 3 complete. 
Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 53 ++- src/datajoint/builtin_types.py | 317 ++++++++++++++++++ 2 files changed, 352 insertions(+), 18 deletions(-) diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index cbdfb860e..6d6d2979b 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -19,7 +19,7 @@ This plan describes the implementation of a three-layer type architecture for Da | Phase 1: Core Type System | ✅ Complete | CORE_TYPES dict, type chain resolution | | Phase 2: Content-Addressed Storage | ✅ Complete | Function-based, no registry table | | Phase 2b: Path-Addressed Storage | ✅ Complete | ObjectType for files/folders | -| Phase 3: User-Defined AttributeTypes | 🔲 Pending | AttachType/FilepathType pending | +| Phase 3: User-Defined AttributeTypes | ✅ Complete | AttachType, XAttachType, FilepathType | | Phase 4: Insert and Fetch Integration | ✅ Complete | Type chain encoding/decoding | | Phase 5: Garbage Collection | 🔲 Pending | | | Phase 6: Documentation and Testing | 🔲 Pending | | @@ -227,14 +227,16 @@ Both produce the same JSON metadata format compatible with `ObjectRef.from_json( --- -## Phase 3: User-Defined AttributeTypes +## Phase 3: User-Defined AttributeTypes ✅ -**Status**: Partially complete +**Status**: Complete + +All built-in AttributeTypes are implemented in `src/datajoint/builtin_types.py`. ### 3.1 XBlobType ✅ -Implemented as shown above. Composes with ``. +External serialized blobs using content-addressed storage. Composes with ``. 
-### 3.2 AttachType and XAttachType 🔲 +### 3.2 AttachType ✅ ```python @register_type @@ -243,41 +245,53 @@ class AttachType(AttributeType): type_name = "attach" dtype = "longblob" - def encode(self, filepath, *, key=None) -> bytes: - path = Path(filepath) - return path.name.encode() + b"\0" + path.read_bytes() + def encode(self, filepath, *, key=None, store_name=None) -> bytes: + # Returns: filename (UTF-8) + null byte + contents + return path.name.encode("utf-8") + b"\x00" + path.read_bytes() def decode(self, stored, *, key=None) -> str: - filename, contents = stored.split(b"\0", 1) - # Write to download_path and return path + # Extracts to download_path, returns local path ... +``` + +### 3.3 XAttachType ✅ +```python @register_type class XAttachType(AttributeType): """External file attachment using content-addressed storage.""" type_name = "xattach" - dtype = "" - # Similar to AttachType but composes with content storage + dtype = "" # Composes with ContentType + # Same encode/decode as AttachType, but stored externally with dedup ``` -### 3.3 FilepathType 🔲 +### 3.4 FilepathType ✅ ```python @register_type class FilepathType(AttributeType): - """Portable relative path reference within configured stores.""" + """Reference to existing file in configured store.""" type_name = "filepath" dtype = "json" def encode(self, relative_path: str, *, key=None, store_name=None) -> dict: - """Register reference to file in store.""" - return {'path': relative_path, 'store': store_name} + # Verifies file exists, returns metadata + return {'path': path, 'store': store_name, 'size': size, ...} def decode(self, stored: dict, *, key=None) -> ObjectRef: - """Return ObjectRef for lazy access.""" - return ObjectRef(store=stored['store'], path=stored['path']) + # Returns ObjectRef for lazy access + return ObjectRef.from_json(stored, backend=backend) ``` +### Type Comparison + +| Type | Storage | Copies File | Dedup | Returns | +|------|---------|-------------|-------|---------| +| `` | 
Database | Yes | No | Local path | +| `` | External | Yes | Yes | Local path | +| `` | Reference | No | N/A | ObjectRef | +| `` | External | Yes | No | ObjectRef | + --- ## Phase 4: Insert and Fetch Integration ✅ @@ -433,9 +447,12 @@ Layer 1: Native Database Types **Built-in AttributeTypes:** ``` → longblob (internal serialized storage) + → longblob (internal file attachment) → json (path-addressed, for Zarr/HDF5/folders) + → json (reference to existing file in store) → json (content-addressed with deduplication) → json (external serialized with dedup) + → json (external file attachment with dedup) ``` **Type Composition Example:** diff --git a/src/datajoint/builtin_types.py b/src/datajoint/builtin_types.py index 27d5d872f..bb2bb20a6 100644 --- a/src/datajoint/builtin_types.py +++ b/src/datajoint/builtin_types.py @@ -10,6 +10,9 @@ - ````: Content-addressed storage with SHA256 deduplication - ````: External serialized blobs using content-addressed storage - ````: Path-addressed storage for files/folders (Zarr, HDF5) + - ````: Internal file attachment stored in database + - ````: External file attachment with deduplication + - ````: Reference to existing file in store Example - Creating a Custom Type: Here's how to define your own AttributeType, modeled after the built-in types:: @@ -427,3 +430,317 @@ def validate(self, value: Any) -> None: if isinstance(value, (str, Path)): return raise TypeError(f" expects bytes or path, got {type(value).__name__}") + + +# ============================================================================= +# File Attachment Types +# ============================================================================= + + +@register_type +class AttachType(AttributeType): + """ + Internal file attachment stored in database. + + The ```` type stores a file directly in the database as a ``LONGBLOB``. + The filename is preserved and the file is extracted to the configured + download path on fetch. 
+ + Example:: + + @schema + class Documents(dj.Manual): + definition = ''' + doc_id : int + --- + report : + ''' + + # Insert a file + table.insert1({'doc_id': 1, 'report': '/path/to/report.pdf'}) + + # Fetch extracts to download_path and returns local path + local_path = (table & 'doc_id=1').fetch1('report') + + Storage Format: + The blob contains: ``filename\\0contents`` + - Filename (UTF-8 encoded) + null byte + raw file contents + + Note: + - For large files, use ```` (external storage with deduplication) + - For files that shouldn't be copied, use ```` + """ + + type_name = "attach" + dtype = "longblob" + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: + """ + Read file and encode as filename + contents. + + Args: + value: Path to file (str or Path). + key: Primary key values (unused). + store_name: Unused for internal storage. + + Returns: + Bytes: filename (UTF-8) + null byte + file contents + """ + from pathlib import Path + + path = Path(value) + if not path.exists(): + raise FileNotFoundError(f"Attachment file not found: {path}") + if path.is_dir(): + raise IsADirectoryError(f" does not support directories: {path}") + + filename = path.name + contents = path.read_bytes() + return filename.encode("utf-8") + b"\x00" + contents + + def decode(self, stored: bytes, *, key: dict | None = None) -> str: + """ + Extract file to download path and return local path. + + Args: + stored: Blob containing filename + null + contents. + key: Primary key values (unused). + + Returns: + Path to extracted file as string. 
+ """ + from pathlib import Path + + from .settings import config + + # Split on first null byte + null_pos = stored.index(b"\x00") + filename = stored[:null_pos].decode("utf-8") + contents = stored[null_pos + 1 :] + + # Write to download path + download_path = Path(config.get("download_path", ".")) + download_path.mkdir(parents=True, exist_ok=True) + local_path = download_path / filename + + local_path.write_bytes(contents) + return str(local_path) + + def validate(self, value: Any) -> None: + """Validate that value is a valid file path.""" + from pathlib import Path + + if not isinstance(value, (str, Path)): + raise TypeError(f" expects a file path, got {type(value).__name__}") + + +@register_type +class XAttachType(AttributeType): + """ + External file attachment with content-addressed storage. + + The ```` type stores files externally using content-addressed + storage. Like ````, the filename is preserved and the file is + extracted on fetch. Unlike ````, files are stored externally + with automatic deduplication. + + Example:: + + @schema + class LargeDocuments(dj.Manual): + definition = ''' + doc_id : int + --- + dataset : + ''' + + # Insert a large file + table.insert1({'doc_id': 1, 'dataset': '/path/to/large_file.h5'}) + + # Fetch downloads and returns local path + local_path = (table & 'doc_id=1').fetch1('dataset') + + Type Composition: + ```` composes with ````:: + + Insert: file → read + encode filename → put_content() → JSON + Fetch: JSON → get_content() → extract → local path + + Comparison:: + + | Type | Storage | Deduplication | Best for | + |------------|----------|---------------|---------------------| + | | Database | No | Small files (<16MB) | + | | External | Yes | Large files | + """ + + type_name = "xattach" + dtype = "" # Composition: uses ContentType + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: + """ + Read file and encode as filename + contents. 
def decode(self, stored: bytes, *, key: dict | None = None) -> str:
    """
    Extract an externally stored attachment and return its local path.

    Args:
        stored: Bytes laid out as filename (UTF-8) + null byte + file contents.
        key: Primary key values (unused).

    Returns:
        Path to the extracted file as a string.

    Raises:
        ValueError: If the payload has no null separator, or if the stored
            filename has no usable final path component.
    """
    from pathlib import Path

    from .settings import config

    # Split on the first null byte only; the contents may contain nulls too.
    raw_name, separator, contents = stored.partition(b"\x00")
    if not separator:
        raise ValueError("Malformed attachment: no null byte separating filename from contents")

    # Keep just the final path component so that a stored name such as
    # "../../somewhere/file" cannot be written outside the download directory.
    filename = Path(raw_name.decode("utf-8")).name
    if filename in ("", ".", ".."):
        raise ValueError(f"Malformed attachment: unusable filename {raw_name!r}")

    # Write to download path
    download_path = Path(config.get("download_path", "."))
    download_path.mkdir(parents=True, exist_ok=True)
    local_path = download_path / filename

    local_path.write_bytes(contents)
    return str(local_path)
def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> dict:
    """
    Build the JSON metadata that references an existing file in a store.

    Args:
        value: Path of the file within the store; converted with ``str()``.
        key: Primary key values (unused).
        store_name: Store where the file exists.

    Returns:
        Metadata dict with the keys ``path``, ``store``, ``size``,
        ``is_dir`` and ``timestamp``.

    Raises:
        FileNotFoundError: If the store backend reports that the path does
            not exist.
    """
    from datetime import datetime, timezone

    from .content_registry import get_store_backend

    relative_path = str(value)
    store_backend = get_store_backend(store_name)

    # A reference is recorded only for a file the backend can actually see.
    if not store_backend.exists(relative_path):
        raise FileNotFoundError(f"File not found in store '{store_name or 'default'}': {relative_path}")

    # Size lookup is best-effort: any backend failure is stored as None.
    try:
        byte_count = store_backend.size(relative_path)
    except Exception:
        byte_count = None

    metadata = {
        "path": relative_path,
        "store": store_name,
        "size": byte_count,
        "is_dir": False,
    }
    metadata["timestamp"] = datetime.now(timezone.utc).isoformat()
    return metadata
def validate(self, value: Any) -> None:
    """
    Validate that value is a path string.

    Args:
        value: Candidate value for a filepath attribute.

    Raises:
        TypeError: If value is not a ``str``.
    """
    if not isinstance(value, str):
        # Name the type in the message so the error is attributable; the
        # previous text began with a bare space and had no subject.
        raise TypeError(f"<filepath> expects a path string, got {type(value).__name__}")
Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 81 ++-- src/datajoint/gc.py | 327 ++++++++++++++++ tests/test_content_storage.py | 231 ++++++++++++ tests/test_gc.py | 214 +++++++++++ tests/test_type_composition.py | 352 ++++++++++++++++++ 5 files changed, 1158 insertions(+), 47 deletions(-) create mode 100644 src/datajoint/gc.py create mode 100644 tests/test_content_storage.py create mode 100644 tests/test_gc.py create mode 100644 tests/test_type_composition.py diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index 6d6d2979b..8ce582f57 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -21,8 +21,8 @@ This plan describes the implementation of a three-layer type architecture for Da | Phase 2b: Path-Addressed Storage | ✅ Complete | ObjectType for files/folders | | Phase 3: User-Defined AttributeTypes | ✅ Complete | AttachType, XAttachType, FilepathType | | Phase 4: Insert and Fetch Integration | ✅ Complete | Type chain encoding/decoding | -| Phase 5: Garbage Collection | 🔲 Pending | | -| Phase 6: Documentation and Testing | 🔲 Pending | | +| Phase 5: Garbage Collection | ✅ Complete | gc.py with scan/collect functions | +| Phase 6: Documentation and Testing | ✅ Complete | Test files for all new types | --- @@ -337,66 +337,50 @@ def _get(connection, attr, data, squeeze, download_path): --- -## Phase 5: Garbage Collection 🔲 +## Phase 5: Garbage Collection ✅ -**Status**: Pending - -### Design (updated for function-based approach): +**Status**: Complete -Since we don't have a registry table, GC works by scanning: +### Implemented in `src/datajoint/gc.py`: ```python -def scan_content_references(schemas: list) -> set[tuple[str, str]]: - """ - Scan all schemas for content references. 
- - Returns: - Set of (content_hash, store) tuples that are referenced - """ - referenced = set() - for schema in schemas: - for table in schema.tables: - for attr in table.heading.attributes: - if uses_content_storage(attr): - # Fetch all JSON metadata from this column - for row in table.fetch(attr.name): - if isinstance(row, dict) and 'hash' in row: - referenced.add((row['hash'], row.get('store'))) - return referenced - -def list_stored_content(store_name: str) -> set[str]: - """List all content hashes in a store by scanning _content/ directory.""" - ... - -def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: - """ - Remove unreferenced content from storage. +import datajoint as dj - Returns: - Stats: {'scanned': N, 'orphaned': M, 'deleted': K, 'bytes_freed': B} - """ - referenced = scan_content_references(schemas) - stored = list_stored_content(store_name) - orphaned = stored - {h for h, s in referenced if s == store_name} +# Scan schemas and find orphaned content +stats = dj.gc.scan(schema1, schema2, store_name='mystore') - if not dry_run: - for content_hash in orphaned: - delete_content(content_hash, store_name) +# Remove orphaned content (dry_run=False to actually delete) +stats = dj.gc.collect(schema1, schema2, store_name='mystore', dry_run=True) - return {'orphaned': len(orphaned), ...} +# Format statistics for display +print(dj.gc.format_stats(stats)) ``` +**Key functions:** +- `scan_references(*schemas, store_name=None)` - Scan tables for content hashes +- `list_stored_content(store_name=None)` - List all content in `_content/` directory +- `scan(*schemas, store_name=None)` - Find orphaned content without deleting +- `collect(*schemas, store_name=None, dry_run=True)` - Remove orphaned content +- `format_stats(stats)` - Human-readable statistics output + +**GC Process:** +1. Scan all tables in provided schemas for content-type attributes +2. Extract content hashes from JSON metadata in those columns +3. 
Scan storage `_content/` directory for all stored hashes +4. Compute orphaned = stored - referenced +5. Optionally delete orphaned content (when `dry_run=False`) + --- -## Phase 6: Documentation and Testing 🔲 +## Phase 6: Documentation and Testing ✅ -**Status**: Pending +**Status**: Complete -### Test files to create: +### Test files created: - `tests/test_content_storage.py` - Content-addressed storage functions -- `tests/test_xblob.py` - XBlobType roundtrip - `tests/test_type_composition.py` - Type chain encoding/decoding - `tests/test_gc.py` - Garbage collection +- `tests/test_attribute_type.py` - AttributeType registry and DJBlobType (existing) --- @@ -415,7 +399,10 @@ def garbage_collect(schemas: list, store_name: str, dry_run=True) -> dict: | `src/datajoint/table.py` | ✅ | Type chain encoding on insert | | `src/datajoint/fetch.py` | ✅ | Type chain decoding on fetch | | `src/datajoint/blob.py` | ✅ | Removed bypass_serialization | -| `src/datajoint/gc.py` | 🔲 | Garbage collection (to be created) | +| `src/datajoint/gc.py` | ✅ | Garbage collection for content storage | +| `tests/test_content_storage.py` | ✅ | Tests for content_registry.py | +| `tests/test_type_composition.py` | ✅ | Tests for type chain encoding/decoding | +| `tests/test_gc.py` | ✅ | Tests for garbage collection | --- diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py new file mode 100644 index 000000000..e862287fc --- /dev/null +++ b/src/datajoint/gc.py @@ -0,0 +1,327 @@ +""" +Garbage collection for content-addressed storage. + +This module provides utilities to identify and remove orphaned content +from external storage. Content becomes orphaned when all database rows +referencing it are deleted. 
+ +Usage: + import datajoint as dj + + # Scan schemas and find orphaned content + stats = dj.gc.scan(schema1, schema2, store_name='mystore') + + # Remove orphaned content (dry_run=False to actually delete) + stats = dj.gc.collect(schema1, schema2, store_name='mystore', dry_run=True) +""" + +from __future__ import annotations + +import json +import logging +from typing import TYPE_CHECKING, Any + +from .content_registry import delete_content, get_store_backend +from .errors import DataJointError + +if TYPE_CHECKING: + from .schemas import Schema + +logger = logging.getLogger(__name__.split(".")[0]) + + +def _uses_content_storage(attr) -> bool: + """ + Check if an attribute uses content-addressed storage. + + This includes types that compose with : + - directly + - (composes with ) + - (composes with ) + + Args: + attr: Attribute from table heading + + Returns: + True if the attribute stores content hashes + """ + if not attr.adapter: + return False + + # Check if this type or its composition chain uses content storage + type_name = getattr(attr.adapter, "type_name", "") + return type_name in ("content", "xblob", "xattach") + + +def _extract_content_refs(value: Any) -> list[tuple[str, str | None]]: + """ + Extract content references from a stored value. + + Args: + value: The stored value (could be JSON string or dict) + + Returns: + List of (content_hash, store_name) tuples + """ + refs = [] + + if value is None: + return refs + + # Parse JSON if string + if isinstance(value, str): + try: + value = json.loads(value) + except (json.JSONDecodeError, TypeError): + return refs + + # Extract hash from dict + if isinstance(value, dict) and "hash" in value: + refs.append((value["hash"], value.get("store"))) + + return refs + + +def scan_references( + *schemas: "Schema", + store_name: str | None = None, + verbose: bool = False, +) -> set[str]: + """ + Scan schemas for content references. 
def list_stored_content(store_name: str | None = None) -> dict[str, int]:
    """
    Enumerate the content hashes found under ``_content/`` in a store.

    Args:
        store_name: Store to scan (None = default store)

    Returns:
        Dict mapping content_hash to size in bytes. A hash whose size lookup
        fails is reported with size 0.
    """
    hex_digits = set("0123456789abcdef")
    store_backend = get_store_backend(store_name)
    found: dict[str, int] = {}

    try:
        # Content is stored at _content/{hash[:2]}/{hash[2:4]}/{hash}
        walk_root = store_backend._full_path("_content/")

        for directory, _subdirs, names in store_backend.fs.walk(walk_root):
            for name in names:
                # Manifest files sit next to content but are not content.
                if name.endswith(".manifest.json"):
                    continue

                # The filename is the full hash: exactly 64 lowercase hex chars.
                if len(name) != 64 or not set(name) <= hex_digits:
                    continue

                try:
                    byte_count = store_backend.fs.size(f"{directory}/{name}")
                except Exception:
                    byte_count = 0
                found[name] = byte_count

    except FileNotFoundError:
        # No _content/ directory exists yet
        pass
    except Exception as e:
        logger.warning(f"Error listing stored content: {e}")

    return found
+ + Args: + *schemas: Schema instances to scan + store_name: Store to check (None = default store) + verbose: Print progress information + + Returns: + Dict with scan statistics: + - referenced: Number of content items referenced in database + - stored: Number of content items in storage + - orphaned: Number of unreferenced content items + - orphaned_bytes: Total size of orphaned content + - orphaned_hashes: List of orphaned content hashes + """ + if not schemas: + raise DataJointError("At least one schema must be provided") + + # Find all referenced content + referenced = scan_references(*schemas, store_name=store_name, verbose=verbose) + + # Find all stored content + stored = list_stored_content(store_name) + + # Find orphaned content + orphaned_hashes = set(stored.keys()) - referenced + orphaned_bytes = sum(stored.get(h, 0) for h in orphaned_hashes) + + return { + "referenced": len(referenced), + "stored": len(stored), + "orphaned": len(orphaned_hashes), + "orphaned_bytes": orphaned_bytes, + "orphaned_hashes": sorted(orphaned_hashes), + } + + +def collect( + *schemas: "Schema", + store_name: str | None = None, + dry_run: bool = True, + verbose: bool = False, +) -> dict[str, Any]: + """ + Remove orphaned content from storage. + + Scans the given schemas for content references, then removes any + content in storage that is not referenced. 
def format_stats(stats: dict[str, Any]) -> str:
    """
    Format GC statistics as a human-readable string.

    Args:
        stats: Statistics dict from scan() or collect(). The keys
            ``referenced``, ``stored`` and ``orphaned`` are required;
            ``orphaned_bytes``, ``deleted``, ``bytes_freed``, ``errors`` and
            ``dry_run`` are optional.

    Returns:
        Formatted multi-line string.
    """
    bytes_per_mb = 1024 * 1024

    lines = [
        "Content Storage Statistics:",
        f" Referenced in database: {stats['referenced']}",
        f" Stored in backend: {stats['stored']}",
        f" Orphaned (unreferenced): {stats['orphaned']}",
    ]

    if "orphaned_bytes" in stats:
        size_mb = stats["orphaned_bytes"] / bytes_per_mb
        lines.append(f" Orphaned size: {size_mb:.2f} MB")

    # The presence of "deleted" marks the dict as coming from collect().
    if "deleted" in stats:
        lines.append("")
        if stats.get("dry_run", True):
            lines.append(" [DRY RUN - no changes made]")
        else:
            lines.append(f" Deleted: {stats['deleted']}")
            # bytes_freed is read leniently, like errors and dry_run, so a
            # partial stats dict formats instead of raising KeyError.
            freed_mb = stats.get("bytes_freed", 0) / bytes_per_mb
            lines.append(f" Bytes freed: {freed_mb:.2f} MB")
            if stats.get("errors", 0) > 0:
                lines.append(f" Errors: {stats['errors']}")

    return "\n".join(lines)
class TestBuildContentPath:
    """Checks the ``_content/{hash[:2]}/{hash[2:4]}/{hash}`` layout produced by build_content_path."""

    def test_builds_hierarchical_path(self):
        """A 64-character hash is placed under two 2-character fan-out levels."""
        digest = "abcdef0123456789" * 4  # 64 chars
        assert build_content_path(digest) == f"_content/ab/cd/{digest}"

    def test_rejects_invalid_hash_length(self):
        """Hashes that are too short or too long are rejected."""
        for bad_hash in ("tooshort", "a" * 65):
            with pytest.raises(DataJointError, match="Invalid content hash length"):
                build_content_path(bad_hash)

    def test_real_hash_path(self):
        """A path built from a computed hash has the expected four segments."""
        digest = compute_content_hash(b"test content")
        segments = build_content_path(digest).split("/")

        # Verify structure
        assert segments[0] == "_content"
        assert len(segments[1]) == 2
        assert len(segments[2]) == 2
        assert len(segments[3]) == 64
        assert segments[1] == digest[:2]
        assert segments[2] == digest[2:4]
        assert segments[3] == digest
class TestGetContent:
    """Checks retrieval and integrity verification in get_content."""

    @patch("datajoint.content_registry.get_store_backend")
    def test_retrieves_content(self, mock_get_backend):
        """Bytes returned by the backend are handed back when the hash matches."""
        payload = b"stored content"
        mock_get_backend.return_value.get_buffer.return_value = payload

        fetched = get_content(compute_content_hash(payload), store_name="test_store")

        assert fetched == payload

    @patch("datajoint.content_registry.get_store_backend")
    def test_verifies_hash(self, mock_get_backend):
        """Bytes that do not hash to the requested value are rejected."""
        expected_hash = compute_content_hash(b"original content")

        # The backend hands back different bytes than were hashed.
        mock_get_backend.return_value.get_buffer.return_value = b"corrupted content"

        with pytest.raises(DataJointError, match="Content hash mismatch"):
            get_content(expected_hash, store_name="test_store")
class TestDeleteContent:
    """Checks delete_content for present and absent content."""

    @patch("datajoint.content_registry.get_store_backend")
    def test_deletes_existing_content(self, mock_get_backend):
        """Existing content is removed and True is returned."""
        backend = mock_get_backend.return_value
        backend.exists.return_value = True

        outcome = delete_content("a" * 64, store_name="test_store")

        assert outcome is True
        backend.remove.assert_called_once()

    @patch("datajoint.content_registry.get_store_backend")
    def test_returns_false_for_nonexistent(self, mock_get_backend):
        """Absent content yields False and no removal is attempted."""
        backend = mock_get_backend.return_value
        backend.exists.return_value = False

        outcome = delete_content("a" * 64, store_name="test_store")

        assert outcome is False
        backend.remove.assert_not_called()
class TestUsesContentStorage:
    """Checks which adapter type names _uses_content_storage accepts."""

    @staticmethod
    def _attr_with_type(type_name):
        """Build a stand-in heading attribute whose adapter reports type_name."""
        attr = MagicMock()
        attr.adapter = MagicMock()
        attr.adapter.type_name = type_name
        return attr

    def test_returns_false_for_no_adapter(self):
        """An attribute without an adapter is not content storage."""
        attr = MagicMock()
        attr.adapter = None

        assert gc._uses_content_storage(attr) is False

    def test_returns_true_for_content_type(self):
        """The content type itself is recognised."""
        assert gc._uses_content_storage(self._attr_with_type("content")) is True

    def test_returns_true_for_xblob_type(self):
        """The xblob type is recognised."""
        assert gc._uses_content_storage(self._attr_with_type("xblob")) is True

    def test_returns_true_for_xattach_type(self):
        """The xattach type is recognised."""
        assert gc._uses_content_storage(self._attr_with_type("xattach")) is True

    def test_returns_false_for_other_types(self):
        """Any other type name is not treated as content storage."""
        assert gc._uses_content_storage(self._attr_with_type("djblob")) is False
class TestScan:
    """Checks argument validation and statistics produced by gc.scan."""

    def test_requires_at_least_one_schema(self):
        """Calling scan with no schemas is an error."""
        with pytest.raises(DataJointError, match="At least one schema must be provided"):
            gc.scan()

    def test_returns_stats(self):
        """Stored-but-unreferenced hashes are reported as orphaned."""
        # hash1 is both referenced and stored; hash3 is stored only.
        referenced_hashes = {"hash1", "hash2"}
        stored_sizes = {"hash1": 100, "hash3": 200}

        with patch("datajoint.gc.scan_references", return_value=referenced_hashes):
            with patch("datajoint.gc.list_stored_content", return_value=stored_sizes):
                stats = gc.scan(MagicMock(), store_name="test_store")

        assert stats["referenced"] == 2
        assert stats["stored"] == 2
        assert stats["orphaned"] == 1
        assert stats["orphaned_bytes"] == 200
        assert "hash3" in stats["orphaned_hashes"]
@patch("datajoint.gc.list_stored_content") + @patch("datajoint.gc.scan") + def test_deletes_orphaned_content(self, mock_scan, mock_list_stored, mock_delete): + """Test that orphaned content is deleted when dry_run=False.""" + mock_scan.return_value = { + "referenced": 1, + "stored": 2, + "orphaned": 1, + "orphaned_bytes": 100, + "orphaned_hashes": ["orphan_hash"], + } + mock_list_stored.return_value = {"orphan_hash": 100} + mock_delete.return_value = True + + mock_schema = MagicMock() + stats = gc.collect(mock_schema, store_name="test_store", dry_run=False) + + assert stats["deleted"] == 1 + assert stats["bytes_freed"] == 100 + assert stats["dry_run"] is False + mock_delete.assert_called_once_with("orphan_hash", "test_store") + + +class TestFormatStats: + """Tests for format_stats function.""" + + def test_formats_scan_stats(self): + """Test formatting scan statistics.""" + stats = { + "referenced": 10, + "stored": 15, + "orphaned": 5, + "orphaned_bytes": 1024 * 1024, # 1 MB + } + + result = gc.format_stats(stats) + + assert "Referenced in database: 10" in result + assert "Stored in backend: 15" in result + assert "Orphaned (unreferenced): 5" in result + assert "1.00 MB" in result + + def test_formats_collect_stats_dry_run(self): + """Test formatting collect statistics with dry_run.""" + stats = { + "referenced": 10, + "stored": 15, + "orphaned": 5, + "deleted": 0, + "bytes_freed": 0, + "dry_run": True, + } + + result = gc.format_stats(stats) + + assert "DRY RUN" in result + + def test_formats_collect_stats_actual(self): + """Test formatting collect statistics after actual deletion.""" + stats = { + "referenced": 10, + "stored": 15, + "orphaned": 5, + "deleted": 3, + "bytes_freed": 2 * 1024 * 1024, # 2 MB + "errors": 2, + "dry_run": False, + } + + result = gc.format_stats(stats) + + assert "Deleted: 3" in result + assert "2.00 MB" in result + assert "Errors: 2" in result diff --git a/tests/test_type_composition.py b/tests/test_type_composition.py new file mode 
100644 index 000000000..0b51b3d68 --- /dev/null +++ b/tests/test_type_composition.py @@ -0,0 +1,352 @@ +""" +Tests for type composition (type chain encoding/decoding). + +This tests the → json composition pattern +and similar type chains. +""" + +from datajoint.attribute_type import ( + AttributeType, + _type_registry, + register_type, + resolve_dtype, +) + + +class TestTypeChainResolution: + """Tests for resolving type chains.""" + + def setup_method(self): + """Clear test types from registry before each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + """Clean up test types after each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_single_type_chain(self): + """Test resolving a single-type chain.""" + + @register_type + class TestSingle(AttributeType): + type_name = "test_single" + dtype = "varchar(100)" + + def encode(self, value, *, key=None, store_name=None): + return str(value) + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "varchar(100)" + assert len(chain) == 1 + assert chain[0].type_name == "test_single" + assert store is None + + def test_two_type_chain(self): + """Test resolving a two-type chain.""" + + @register_type + class TestInner(AttributeType): + type_name = "test_inner" + dtype = "longblob" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TestOuter(AttributeType): + type_name = "test_outer" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "longblob" + assert len(chain) == 2 + assert chain[0].type_name == 
"test_outer" + assert chain[1].type_name == "test_inner" + + def test_three_type_chain(self): + """Test resolving a three-type chain.""" + + @register_type + class TestBase(AttributeType): + type_name = "test_base" + dtype = "json" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TestMiddle(AttributeType): + type_name = "test_middle" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TestTop(AttributeType): + type_name = "test_top" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 3 + assert chain[0].type_name == "test_top" + assert chain[1].type_name == "test_middle" + assert chain[2].type_name == "test_base" + + +class TestTypeChainEncodeDecode: + """Tests for encode/decode through type chains.""" + + def setup_method(self): + """Clear test types from registry before each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + """Clean up test types after each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_encode_order(self): + """Test that encode is applied outer → inner.""" + encode_order = [] + + @register_type + class TestInnerEnc(AttributeType): + type_name = "test_inner_enc" + dtype = "longblob" + + def encode(self, value, *, key=None, store_name=None): + encode_order.append("inner") + return value + b"_inner" + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TestOuterEnc(AttributeType): + type_name = "test_outer_enc" + dtype = "" + + 
def encode(self, value, *, key=None, store_name=None): + encode_order.append("outer") + return value + b"_outer" + + def decode(self, stored, *, key=None): + return stored + + _, chain, _ = resolve_dtype("") + + # Apply encode in order: outer first, then inner + value = b"start" + for attr_type in chain: + value = attr_type.encode(value) + + assert encode_order == ["outer", "inner"] + assert value == b"start_outer_inner" + + def test_decode_order(self): + """Test that decode is applied inner → outer (reverse of encode).""" + decode_order = [] + + @register_type + class TestInnerDec(AttributeType): + type_name = "test_inner_dec" + dtype = "longblob" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + decode_order.append("inner") + return stored.replace(b"_inner", b"") + + @register_type + class TestOuterDec(AttributeType): + type_name = "test_outer_dec" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + decode_order.append("outer") + return stored.replace(b"_outer", b"") + + _, chain, _ = resolve_dtype("") + + # Apply decode in reverse order: inner first, then outer + value = b"start_outer_inner" + for attr_type in reversed(chain): + value = attr_type.decode(value) + + assert decode_order == ["inner", "outer"] + assert value == b"start" + + def test_roundtrip(self): + """Test encode/decode roundtrip through a type chain.""" + + @register_type + class TestInnerRt(AttributeType): + type_name = "test_inner_rt" + dtype = "longblob" + + def encode(self, value, *, key=None, store_name=None): + # Compress (just add prefix for testing) + return b"COMPRESSED:" + value + + def decode(self, stored, *, key=None): + # Decompress + return stored.replace(b"COMPRESSED:", b"") + + @register_type + class TestOuterRt(AttributeType): + type_name = "test_outer_rt" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + # 
Serialize (just encode string for testing) + return str(value).encode("utf-8") + + def decode(self, stored, *, key=None): + # Deserialize + return stored.decode("utf-8") + + _, chain, _ = resolve_dtype("") + + # Original value + original = "test data" + + # Encode: outer → inner + encoded = original + for attr_type in chain: + encoded = attr_type.encode(encoded) + + assert encoded == b"COMPRESSED:test data" + + # Decode: inner → outer (reversed) + decoded = encoded + for attr_type in reversed(chain): + decoded = attr_type.decode(decoded) + + assert decoded == original + + +class TestBuiltinTypeComposition: + """Tests for built-in type composition.""" + + def test_xblob_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 2 + assert chain[0].type_name == "xblob" + assert chain[1].type_name == "content" + + def test_xattach_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 2 + assert chain[0].type_name == "xattach" + assert chain[1].type_name == "content" + + def test_djblob_resolves_to_longblob(self): + """Test that → longblob (no chain).""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "longblob" + assert len(chain) == 1 + assert chain[0].type_name == "djblob" + + def test_content_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 1 + assert chain[0].type_name == "content" + + def test_object_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 1 + assert chain[0].type_name == "object" + + def test_attach_resolves_to_longblob(self): + """Test that → longblob.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "longblob" + assert 
len(chain) == 1 + assert chain[0].type_name == "attach" + + def test_filepath_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 1 + assert chain[0].type_name == "filepath" + + +class TestStoreNameParsing: + """Tests for store name parsing in type specs.""" + + def test_type_with_store(self): + """Test parsing type with store name.""" + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert store == "mystore" + + def test_type_without_store(self): + """Test parsing type without store name.""" + final_dtype, chain, store = resolve_dtype("") + + assert store is None + + def test_filepath_with_store(self): + """Test parsing filepath with store name.""" + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert store == "s3store" From 73535de8790491dee128ba5cffde2be5114f2107 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Dec 2025 02:22:38 +0000 Subject: [PATCH 39/42] Add object type garbage collection support Extend gc.py to handle both storage patterns: - Content-addressed storage: , , - Path-addressed storage: New functions added: - _uses_object_storage() - detect object type attributes - _extract_object_refs() - extract path refs from JSON - scan_object_references() - scan schemas for object paths - list_stored_objects() - list all objects in storage - delete_object() - delete object directory tree Updated scan() and collect() to handle both storage types, with combined and per-type statistics in the output. Updated tests for new statistics format. 
Co-authored-by: dimitri-yatsenko --- .../storage-types-implementation-plan.md | 31 +- src/datajoint/gc.py | 360 +++++++++++++++--- tests/test_gc.py | 143 ++++++- 3 files changed, 467 insertions(+), 67 deletions(-) diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md index 8ce582f57..c15a2292c 100644 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -346,29 +346,42 @@ def _get(connection, attr, data, squeeze, download_path): ```python import datajoint as dj -# Scan schemas and find orphaned content +# Scan schemas and find orphaned content/objects stats = dj.gc.scan(schema1, schema2, store_name='mystore') -# Remove orphaned content (dry_run=False to actually delete) +# Remove orphaned content/objects (dry_run=False to actually delete) stats = dj.gc.collect(schema1, schema2, store_name='mystore', dry_run=True) # Format statistics for display print(dj.gc.format_stats(stats)) ``` +**Supported storage patterns:** + +1. **Content-Addressed Storage** (`<content>`, `<xblob>`, `<xattach>`): + - Stored at: `_content/{hash[:2]}/{hash[2:4]}/{hash}` + - Referenced by SHA256 hash in JSON metadata + +2. 
**Path-Addressed Storage** (`<object>`): + - Stored at: `{schema}/{table}/objects/{pk}/{field}_{token}/` + - Referenced by path in JSON metadata + **Key functions:** - `scan_references(*schemas, store_name=None)` - Scan tables for content hashes +- `scan_object_references(*schemas, store_name=None)` - Scan tables for object paths - `list_stored_content(store_name=None)` - List all content in `_content/` directory -- `scan(*schemas, store_name=None)` - Find orphaned content without deleting -- `collect(*schemas, store_name=None, dry_run=True)` - Remove orphaned content +- `list_stored_objects(store_name=None)` - List all objects in `*/objects/` directories +- `scan(*schemas, store_name=None)` - Find orphaned content/objects without deleting +- `collect(*schemas, store_name=None, dry_run=True)` - Remove orphaned content/objects +- `delete_object(path, store_name=None)` - Delete an object directory - `format_stats(stats)` - Human-readable statistics output **GC Process:** -1. Scan all tables in provided schemas for content-type attributes -2. Extract content hashes from JSON metadata in those columns -3. Scan storage `_content/` directory for all stored hashes -4. Compute orphaned = stored - referenced -5. Optionally delete orphaned content (when `dry_run=False`) +1. Scan all tables in provided schemas for content-type and object-type attributes +2. Extract content hashes and object paths from JSON metadata columns +3. Scan storage for all stored content (`_content/`) and objects (`*/objects/`) +4. Compute orphaned = stored - referenced (for both types) +5. Optionally delete orphaned items (when `dry_run=False`) --- diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py index e862287fc..e0b7aaafe 100644 --- a/src/datajoint/gc.py +++ b/src/datajoint/gc.py @@ -1,10 +1,17 @@ """ -Garbage collection for content-addressed storage. +Garbage collection for external storage. This module provides utilities to identify and remove orphaned content from external storage.
Content becomes orphaned when all database rows referencing it are deleted. +Supports two storage patterns: +- Content-addressed storage: , , + Stored at: _content/{hash[:2]}/{hash[2:4]}/{hash} + +- Path-addressed storage: + Stored at: {schema}/{table}/objects/{pk}/{field}_{token}/ + Usage: import datajoint as dj @@ -53,6 +60,23 @@ def _uses_content_storage(attr) -> bool: return type_name in ("content", "xblob", "xattach") +def _uses_object_storage(attr) -> bool: + """ + Check if an attribute uses path-addressed object storage. + + Args: + attr: Attribute from table heading + + Returns: + True if the attribute stores object paths + """ + if not attr.adapter: + return False + + type_name = getattr(attr.adapter, "type_name", "") + return type_name == "object" + + def _extract_content_refs(value: Any) -> list[tuple[str, str | None]]: """ Extract content references from a stored value. @@ -82,6 +106,35 @@ def _extract_content_refs(value: Any) -> list[tuple[str, str | None]]: return refs +def _extract_object_refs(value: Any) -> list[tuple[str, str | None]]: + """ + Extract object path references from a stored value. + + Args: + value: The stored value (could be JSON string or dict) + + Returns: + List of (path, store_name) tuples + """ + refs = [] + + if value is None: + return refs + + # Parse JSON if string + if isinstance(value, str): + try: + value = json.loads(value) + except (json.JSONDecodeError, TypeError): + return refs + + # Extract path from dict + if isinstance(value, dict) and "path" in value: + refs.append((value["path"], value.get("store"))) + + return refs + + def scan_references( *schemas: "Schema", store_name: str | None = None, @@ -139,6 +192,62 @@ def scan_references( return referenced +def scan_object_references( + *schemas: "Schema", + store_name: str | None = None, + verbose: bool = False, +) -> set[str]: + """ + Scan schemas for object path references. 
+ + Examines all tables in the given schemas and extracts object paths + from columns that use path-addressed storage (). + + Args: + *schemas: Schema instances to scan + store_name: Only include references to this store (None = all stores) + verbose: Print progress information + + Returns: + Set of object paths that are referenced + """ + referenced: set[str] = set() + + for schema in schemas: + if verbose: + logger.info(f"Scanning schema for objects: {schema.database}") + + # Get all tables in schema + for table_name in schema.list_tables(): + try: + # Get table class + table = schema.spawn_table(table_name) + + # Check each attribute for object storage + for attr_name, attr in table.heading.attributes.items(): + if not _uses_object_storage(attr): + continue + + if verbose: + logger.info(f" Scanning {table_name}.{attr_name}") + + # Fetch all values for this attribute + try: + values = table.fetch(attr_name) + for value in values: + for path, ref_store in _extract_object_refs(value): + # Filter by store if specified + if store_name is None or ref_store == store_name: + referenced.add(path) + except Exception as e: + logger.warning(f"Error scanning {table_name}.{attr_name}: {e}") + + except Exception as e: + logger.warning(f"Error accessing table {table_name}: {e}") + + return referenced + + def list_stored_content(store_name: str | None = None) -> dict[str, int]: """ List all content hashes in storage. @@ -189,13 +298,94 @@ def list_stored_content(store_name: str | None = None) -> dict[str, int]: return stored +def list_stored_objects(store_name: str | None = None) -> dict[str, int]: + """ + List all object paths in storage. 
+ + Scans for directories matching the object storage pattern: + {schema}/{table}/objects/{pk}/{field}_{token}/ + + Args: + store_name: Store to scan (None = default store) + + Returns: + Dict mapping object_path to size in bytes + """ + backend = get_store_backend(store_name) + stored: dict[str, int] = {} + + try: + # Walk the storage looking for /objects/ directories + full_prefix = backend._full_path("") + + for root, dirs, files in backend.fs.walk(full_prefix): + # Skip _content directory + if "_content" in root: + continue + + # Look for "objects" directory pattern + if "/objects/" in root: + # This could be an object storage path + # Path pattern: {schema}/{table}/objects/{pk}/{field}_{token} + relative_path = root.replace(full_prefix, "").lstrip("/") + + # Calculate total size of this object directory + total_size = 0 + for file in files: + try: + file_path = f"{root}/{file}" + total_size += backend.fs.size(file_path) + except Exception: + pass + + # Only count directories with files (actual objects) + if total_size > 0 or files: + stored[relative_path] = total_size + + except FileNotFoundError: + pass + except Exception as e: + logger.warning(f"Error listing stored objects: {e}") + + return stored + + +def delete_object(path: str, store_name: str | None = None) -> bool: + """ + Delete an object directory from storage. 
+ + Args: + path: Object path (relative to store root) + store_name: Store name (None = default store) + + Returns: + True if deleted, False if not found + """ + backend = get_store_backend(store_name) + + try: + full_path = backend._full_path(path) + if backend.fs.exists(full_path): + # Remove entire directory tree + backend.fs.rm(full_path, recursive=True) + logger.debug(f"Deleted object: {path}") + return True + except Exception as e: + logger.warning(f"Error deleting object {path}: {e}") + + return False + + def scan( *schemas: "Schema", store_name: str | None = None, verbose: bool = False, ) -> dict[str, Any]: """ - Scan for orphaned content without deleting. + Scan for orphaned content and objects without deleting. + + Scans both content-addressed storage (for , , ) + and path-addressed storage (for ). Args: *schemas: Schema instances to scan @@ -204,31 +394,50 @@ def scan( Returns: Dict with scan statistics: - - referenced: Number of content items referenced in database - - stored: Number of content items in storage - - orphaned: Number of unreferenced content items - - orphaned_bytes: Total size of orphaned content + - content_referenced: Number of content items referenced in database + - content_stored: Number of content items in storage + - content_orphaned: Number of unreferenced content items + - content_orphaned_bytes: Total size of orphaned content - orphaned_hashes: List of orphaned content hashes + - object_referenced: Number of objects referenced in database + - object_stored: Number of objects in storage + - object_orphaned: Number of unreferenced objects + - object_orphaned_bytes: Total size of orphaned objects + - orphaned_paths: List of orphaned object paths """ if not schemas: raise DataJointError("At least one schema must be provided") - # Find all referenced content - referenced = scan_references(*schemas, store_name=store_name, verbose=verbose) + # --- Content-addressed storage --- + content_referenced = scan_references(*schemas, 
store_name=store_name, verbose=verbose) + content_stored = list_stored_content(store_name) + orphaned_hashes = set(content_stored.keys()) - content_referenced + content_orphaned_bytes = sum(content_stored.get(h, 0) for h in orphaned_hashes) - # Find all stored content - stored = list_stored_content(store_name) - - # Find orphaned content - orphaned_hashes = set(stored.keys()) - referenced - orphaned_bytes = sum(stored.get(h, 0) for h in orphaned_hashes) + # --- Path-addressed storage (objects) --- + object_referenced = scan_object_references(*schemas, store_name=store_name, verbose=verbose) + object_stored = list_stored_objects(store_name) + orphaned_paths = set(object_stored.keys()) - object_referenced + object_orphaned_bytes = sum(object_stored.get(p, 0) for p in orphaned_paths) return { - "referenced": len(referenced), - "stored": len(stored), - "orphaned": len(orphaned_hashes), - "orphaned_bytes": orphaned_bytes, + # Content-addressed storage stats + "content_referenced": len(content_referenced), + "content_stored": len(content_stored), + "content_orphaned": len(orphaned_hashes), + "content_orphaned_bytes": content_orphaned_bytes, "orphaned_hashes": sorted(orphaned_hashes), + # Path-addressed storage stats + "object_referenced": len(object_referenced), + "object_stored": len(object_stored), + "object_orphaned": len(orphaned_paths), + "object_orphaned_bytes": object_orphaned_bytes, + "orphaned_paths": sorted(orphaned_paths), + # Combined totals + "referenced": len(content_referenced) + len(object_referenced), + "stored": len(content_stored) + len(object_stored), + "orphaned": len(orphaned_hashes) + len(orphaned_paths), + "orphaned_bytes": content_orphaned_bytes + object_orphaned_bytes, } @@ -239,10 +448,10 @@ def collect( verbose: bool = False, ) -> dict[str, Any]: """ - Remove orphaned content from storage. + Remove orphaned content and objects from storage. 
- Scans the given schemas for content references, then removes any - content in storage that is not referenced. + Scans the given schemas for content and object references, then removes any + storage items that are not referenced. Args: *schemas: Schema instances to scan @@ -252,43 +461,69 @@ def collect( Returns: Dict with collection statistics: - - referenced: Number of content items referenced in database - - stored: Number of content items in storage - - orphaned: Number of unreferenced content items - - deleted: Number of items deleted (0 if dry_run) + - referenced: Total items referenced in database + - stored: Total items in storage + - orphaned: Total unreferenced items + - content_deleted: Number of content items deleted + - object_deleted: Number of object items deleted + - deleted: Total items deleted (0 if dry_run) - bytes_freed: Bytes freed (0 if dry_run) - errors: Number of deletion errors """ - # First scan to find orphaned content + # First scan to find orphaned content and objects stats = scan(*schemas, store_name=store_name, verbose=verbose) - deleted = 0 + content_deleted = 0 + object_deleted = 0 bytes_freed = 0 errors = 0 - if not dry_run and stats["orphaned"] > 0: - stored = list_stored_content(store_name) - - for content_hash in stats["orphaned_hashes"]: - try: - size = stored.get(content_hash, 0) - if delete_content(content_hash, store_name): - deleted += 1 - bytes_freed += size - if verbose: - logger.info(f"Deleted: {content_hash[:16]}... 
({size} bytes)") - except Exception as e: - errors += 1 - logger.warning(f"Failed to delete {content_hash[:16]}...: {e}") + if not dry_run: + # Delete orphaned content (hash-addressed) + if stats["content_orphaned"] > 0: + content_stored = list_stored_content(store_name) + + for content_hash in stats["orphaned_hashes"]: + try: + size = content_stored.get(content_hash, 0) + if delete_content(content_hash, store_name): + content_deleted += 1 + bytes_freed += size + if verbose: + logger.info(f"Deleted content: {content_hash[:16]}... ({size} bytes)") + except Exception as e: + errors += 1 + logger.warning(f"Failed to delete content {content_hash[:16]}...: {e}") + + # Delete orphaned objects (path-addressed) + if stats["object_orphaned"] > 0: + object_stored = list_stored_objects(store_name) + + for path in stats["orphaned_paths"]: + try: + size = object_stored.get(path, 0) + if delete_object(path, store_name): + object_deleted += 1 + bytes_freed += size + if verbose: + logger.info(f"Deleted object: {path} ({size} bytes)") + except Exception as e: + errors += 1 + logger.warning(f"Failed to delete object {path}: {e}") return { "referenced": stats["referenced"], "stored": stats["stored"], "orphaned": stats["orphaned"], - "deleted": deleted, + "content_deleted": content_deleted, + "object_deleted": object_deleted, + "deleted": content_deleted + object_deleted, "bytes_freed": bytes_freed, "errors": errors, "dry_run": dry_run, + # Include detailed stats + "content_orphaned": stats["content_orphaned"], + "object_orphaned": stats["object_orphaned"], } @@ -302,23 +537,52 @@ def format_stats(stats: dict[str, Any]) -> str: Returns: Formatted string """ - lines = [ - "Content Storage Statistics:", - f" Referenced in database: {stats['referenced']}", - f" Stored in backend: {stats['stored']}", - f" Orphaned (unreferenced): {stats['orphaned']}", - ] + lines = ["External Storage Statistics:"] + + # Show content-addressed storage stats if present + if "content_referenced" in stats: + 
lines.append("") + lines.append("Content-Addressed Storage (, , ):") + lines.append(f" Referenced: {stats['content_referenced']}") + lines.append(f" Stored: {stats['content_stored']}") + lines.append(f" Orphaned: {stats['content_orphaned']}") + if "content_orphaned_bytes" in stats: + size_mb = stats["content_orphaned_bytes"] / (1024 * 1024) + lines.append(f" Orphaned size: {size_mb:.2f} MB") + + # Show path-addressed storage stats if present + if "object_referenced" in stats: + lines.append("") + lines.append("Path-Addressed Storage ():") + lines.append(f" Referenced: {stats['object_referenced']}") + lines.append(f" Stored: {stats['object_stored']}") + lines.append(f" Orphaned: {stats['object_orphaned']}") + if "object_orphaned_bytes" in stats: + size_mb = stats["object_orphaned_bytes"] / (1024 * 1024) + lines.append(f" Orphaned size: {size_mb:.2f} MB") + + # Show totals + lines.append("") + lines.append("Totals:") + lines.append(f" Referenced in database: {stats['referenced']}") + lines.append(f" Stored in backend: {stats['stored']}") + lines.append(f" Orphaned (unreferenced): {stats['orphaned']}") if "orphaned_bytes" in stats: size_mb = stats["orphaned_bytes"] / (1024 * 1024) lines.append(f" Orphaned size: {size_mb:.2f} MB") + # Show deletion results if this is from collect() if "deleted" in stats: lines.append("") if stats.get("dry_run", True): lines.append(" [DRY RUN - no changes made]") else: lines.append(f" Deleted: {stats['deleted']}") + if "content_deleted" in stats: + lines.append(f" Content: {stats['content_deleted']}") + if "object_deleted" in stats: + lines.append(f" Objects: {stats['object_deleted']}") freed_mb = stats["bytes_freed"] / (1024 * 1024) lines.append(f" Bytes freed: {freed_mb:.2f} MB") if stats.get("errors", 0) > 0: diff --git a/tests/test_gc.py b/tests/test_gc.py index 5af71a0a9..2c312bcc0 100644 --- a/tests/test_gc.py +++ b/tests/test_gc.py @@ -85,6 +85,61 @@ def test_returns_empty_for_dict_without_hash(self): assert 
gc._extract_content_refs({"other": "data"}) == [] +class TestUsesObjectStorage: + """Tests for _uses_object_storage helper function.""" + + def test_returns_false_for_no_adapter(self): + """Test that False is returned when attribute has no adapter.""" + attr = MagicMock() + attr.adapter = None + + assert gc._uses_object_storage(attr) is False + + def test_returns_true_for_object_type(self): + """Test that True is returned for type.""" + attr = MagicMock() + attr.adapter = MagicMock() + attr.adapter.type_name = "object" + + assert gc._uses_object_storage(attr) is True + + def test_returns_false_for_other_types(self): + """Test that False is returned for non-object types.""" + attr = MagicMock() + attr.adapter = MagicMock() + attr.adapter.type_name = "xblob" + + assert gc._uses_object_storage(attr) is False + + +class TestExtractObjectRefs: + """Tests for _extract_object_refs helper function.""" + + def test_returns_empty_for_none(self): + """Test that empty list is returned for None value.""" + assert gc._extract_object_refs(None) == [] + + def test_parses_json_string(self): + """Test parsing JSON string with path.""" + value = '{"path": "schema/table/objects/pk/field_abc123", "store": "mystore"}' + refs = gc._extract_object_refs(value) + + assert len(refs) == 1 + assert refs[0] == ("schema/table/objects/pk/field_abc123", "mystore") + + def test_parses_dict_directly(self): + """Test parsing dict with path.""" + value = {"path": "test/path", "store": None} + refs = gc._extract_object_refs(value) + + assert len(refs) == 1 + assert refs[0] == ("test/path", None) + + def test_returns_empty_for_dict_without_path(self): + """Test that empty list is returned for dict without path key.""" + assert gc._extract_object_refs({"other": "data"}) == [] + + class TestScan: """Tests for scan function.""" @@ -93,28 +148,47 @@ def test_requires_at_least_one_schema(self): with pytest.raises(DataJointError, match="At least one schema must be provided"): gc.scan() + 
@patch("datajoint.gc.scan_object_references") + @patch("datajoint.gc.list_stored_objects") @patch("datajoint.gc.scan_references") @patch("datajoint.gc.list_stored_content") - def test_returns_stats(self, mock_list_stored, mock_scan_refs): + def test_returns_stats(self, mock_list_content, mock_scan_refs, mock_list_objects, mock_scan_objects): """Test that scan returns proper statistics.""" - # Mock referenced hashes + # Mock content-addressed storage mock_scan_refs.return_value = {"hash1", "hash2"} - - # Mock stored content (hash1 referenced, hash3 orphaned) - mock_list_stored.return_value = { + mock_list_content.return_value = { "hash1": 100, - "hash3": 200, + "hash3": 200, # orphaned + } + + # Mock path-addressed storage + mock_scan_objects.return_value = {"path/to/obj1"} + mock_list_objects.return_value = { + "path/to/obj1": 500, + "path/to/obj2": 300, # orphaned } mock_schema = MagicMock() stats = gc.scan(mock_schema, store_name="test_store") - assert stats["referenced"] == 2 - assert stats["stored"] == 2 - assert stats["orphaned"] == 1 - assert stats["orphaned_bytes"] == 200 + # Content stats + assert stats["content_referenced"] == 2 + assert stats["content_stored"] == 2 + assert stats["content_orphaned"] == 1 assert "hash3" in stats["orphaned_hashes"] + # Object stats + assert stats["object_referenced"] == 1 + assert stats["object_stored"] == 2 + assert stats["object_orphaned"] == 1 + assert "path/to/obj2" in stats["orphaned_paths"] + + # Combined totals + assert stats["referenced"] == 3 + assert stats["stored"] == 4 + assert stats["orphaned"] == 2 + assert stats["orphaned_bytes"] == 500 # 200 content + 300 object + class TestCollect: """Tests for collect function.""" @@ -128,6 +202,9 @@ def test_dry_run_does_not_delete(self, mock_scan): "orphaned": 1, "orphaned_bytes": 100, "orphaned_hashes": ["orphan_hash"], + "orphaned_paths": [], + "content_orphaned": 1, + "object_orphaned": 0, } mock_schema = MagicMock() @@ -148,6 +225,9 @@ def 
test_deletes_orphaned_content(self, mock_scan, mock_list_stored, mock_delete "orphaned": 1, "orphaned_bytes": 100, "orphaned_hashes": ["orphan_hash"], + "orphaned_paths": [], + "content_orphaned": 1, + "object_orphaned": 0, } mock_list_stored.return_value = {"orphan_hash": 100} mock_delete.return_value = True @@ -156,10 +236,38 @@ def test_deletes_orphaned_content(self, mock_scan, mock_list_stored, mock_delete stats = gc.collect(mock_schema, store_name="test_store", dry_run=False) assert stats["deleted"] == 1 + assert stats["content_deleted"] == 1 assert stats["bytes_freed"] == 100 assert stats["dry_run"] is False mock_delete.assert_called_once_with("orphan_hash", "test_store") + @patch("datajoint.gc.delete_object") + @patch("datajoint.gc.list_stored_objects") + @patch("datajoint.gc.scan") + def test_deletes_orphaned_objects(self, mock_scan, mock_list_objects, mock_delete): + """Test that orphaned objects are deleted when dry_run=False.""" + mock_scan.return_value = { + "referenced": 1, + "stored": 2, + "orphaned": 1, + "orphaned_bytes": 500, + "orphaned_hashes": [], + "orphaned_paths": ["path/to/orphan"], + "content_orphaned": 0, + "object_orphaned": 1, + } + mock_list_objects.return_value = {"path/to/orphan": 500} + mock_delete.return_value = True + + mock_schema = MagicMock() + stats = gc.collect(mock_schema, store_name="test_store", dry_run=False) + + assert stats["deleted"] == 1 + assert stats["object_deleted"] == 1 + assert stats["bytes_freed"] == 500 + assert stats["dry_run"] is False + mock_delete.assert_called_once_with("path/to/orphan", "test_store") + class TestFormatStats: """Tests for format_stats function.""" @@ -171,6 +279,14 @@ def test_formats_scan_stats(self): "stored": 15, "orphaned": 5, "orphaned_bytes": 1024 * 1024, # 1 MB + "content_referenced": 6, + "content_stored": 8, + "content_orphaned": 2, + "content_orphaned_bytes": 512 * 1024, + "object_referenced": 4, + "object_stored": 7, + "object_orphaned": 3, + "object_orphaned_bytes": 512 * 1024, 
} result = gc.format_stats(stats) @@ -179,6 +295,9 @@ def test_formats_scan_stats(self): assert "Stored in backend: 15" in result assert "Orphaned (unreferenced): 5" in result assert "1.00 MB" in result + # Check for detailed sections + assert "Content-Addressed Storage" in result + assert "Path-Addressed Storage" in result def test_formats_collect_stats_dry_run(self): """Test formatting collect statistics with dry_run.""" @@ -202,6 +321,8 @@ def test_formats_collect_stats_actual(self): "stored": 15, "orphaned": 5, "deleted": 3, + "content_deleted": 2, + "object_deleted": 1, "bytes_freed": 2 * 1024 * 1024, # 2 MB "errors": 2, "dry_run": False, @@ -210,5 +331,7 @@ def test_formats_collect_stats_actual(self): result = gc.format_stats(stats) assert "Deleted: 3" in result + assert "Content: 2" in result + assert "Objects: 1" in result assert "2.00 MB" in result assert "Errors: 2" in result From 3fc00ee539352b7bd0ea5c44e5f8e14e4ad9d18b Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Dec 2025 03:30:30 +0000 Subject: [PATCH 40/42] Move EXTERNAL_TABLE_ROOT to external.py (deprecated) External tables are deprecated in favor of the new storage type system. Move the constant to external.py where it's used, keeping declare.py clean. 
Co-authored-by: dimitri-yatsenko --- src/datajoint/external.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/datajoint/external.py b/src/datajoint/external.py index 06e76af37..81230fb95 100644 --- a/src/datajoint/external.py +++ b/src/datajoint/external.py @@ -5,15 +5,18 @@ from tqdm import tqdm -from .declare import EXTERNAL_TABLE_ROOT from .errors import DataJointError, MissingExternalFile from .hash import uuid_from_buffer, uuid_from_file from .heading import Heading from .settings import config from .storage import StorageBackend from .table import FreeTable, Table + from .utils import safe_write +# External table name root (deprecated - external tables are being phased out) +EXTERNAL_TABLE_ROOT = "~external" + logger = logging.getLogger(__name__.split(".")[0]) CACHE_SUBFOLDING = ( From b4512c9fd7289e911d7c93056495fa3ad79264e1 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Dec 2025 03:35:50 +0000 Subject: [PATCH 41/42] Remove deprecated external.py module External tables (~external_*) are deprecated in favor of the new AttributeType-based storage system. The new types (, , ) store data directly to storage via StorageBackend without tracking tables. 
- Remove src/datajoint/external.py entirely - Remove ExternalMapping from schemas.py - Remove external table pre-declaration from table.py Co-authored-by: dimitri-yatsenko --- src/datajoint/external.py | 455 -------------------------------------- src/datajoint/schemas.py | 2 - src/datajoint/table.py | 12 +- 3 files changed, 3 insertions(+), 466 deletions(-) delete mode 100644 src/datajoint/external.py diff --git a/src/datajoint/external.py b/src/datajoint/external.py deleted file mode 100644 index 81230fb95..000000000 --- a/src/datajoint/external.py +++ /dev/null @@ -1,455 +0,0 @@ -import logging -import warnings -from collections.abc import Mapping -from pathlib import Path, PurePosixPath, PureWindowsPath - -from tqdm import tqdm - -from .errors import DataJointError, MissingExternalFile -from .hash import uuid_from_buffer, uuid_from_file -from .heading import Heading -from .settings import config -from .storage import StorageBackend -from .table import FreeTable, Table - -from .utils import safe_write - -# External table name root (deprecated - external tables are being phased out) -EXTERNAL_TABLE_ROOT = "~external" - -logger = logging.getLogger(__name__.split(".")[0]) - -CACHE_SUBFOLDING = ( - 2, - 2, -) # (2, 2) means "0123456789abcd" will be saved as "01/23/0123456789abcd" -SUPPORT_MIGRATED_BLOBS = True # support blobs migrated from datajoint 0.11.* - - -def subfold(name, folds): - """ - subfolding for external storage: e.g. subfold('aBCdefg', (2, 3)) --> ['ab','cde'] - """ - return (name[: folds[0]].lower(),) + subfold(name[folds[0] :], folds[1:]) if folds else () - - -class ExternalTable(Table): - """ - The table tracking externally stored objects. 
- Declare as ExternalTable(connection, database) - """ - - def __init__(self, connection, store, database): - self.store = store - self.database = database - self._connection = connection - self._heading = Heading( - table_info=dict( - conn=connection, - database=database, - table_name=self.table_name, - context=None, - ) - ) - self._support = [self.full_table_name] - if not self.is_declared: - self.declare() - # Initialize storage backend (validates configuration) - self.storage = StorageBackend(config.get_store_spec(store)) - - @property - def definition(self): - return """ - # external storage tracking - hash : uuid # hash of contents (blob), of filename + contents (attach), or relative filepath (filepath) - --- - size :bigint unsigned # size of object in bytes - attachment_name=null : varchar(255) # the filename of an attachment - filepath=null : varchar(1000) # relative filepath or attachment filename - contents_hash=null : uuid # used for the filepath datatype - timestamp=CURRENT_TIMESTAMP :timestamp # automatic timestamp - """ - - @property - def table_name(self): - return f"{EXTERNAL_TABLE_ROOT}_{self.store}" - - @property - def s3(self): - """Deprecated: Use storage property instead.""" - warnings.warn( - "ExternalTable.s3 is deprecated. Use ExternalTable.storage instead.", - DeprecationWarning, - stacklevel=2, - ) - # For backward compatibility, return a legacy s3.Folder if needed - from . 
import s3 - - if not hasattr(self, "_s3_legacy") or self._s3_legacy is None: - self._s3_legacy = s3.Folder(**self.storage.spec) - return self._s3_legacy - - # - low-level operations - private - - def _make_external_filepath(self, relative_filepath): - """resolve the complete external path based on the relative path""" - spec = self.storage.spec - # Strip root for S3 paths - if spec["protocol"] == "s3": - posix_path = PurePosixPath(PureWindowsPath(spec["location"])) - location_path = ( - Path(*posix_path.parts[1:]) - if len(spec["location"]) > 0 and any(case in posix_path.parts[0] for case in ("\\", ":")) - else Path(posix_path) - ) - return PurePosixPath(location_path, relative_filepath) - # Preserve root for local filesystem - elif spec["protocol"] == "file": - return PurePosixPath(Path(spec["location"]), relative_filepath) - else: - # For other protocols (gcs, azure, etc.), treat like S3 - location = spec.get("location", "") - return PurePosixPath(location, relative_filepath) if location else PurePosixPath(relative_filepath) - - def _make_uuid_path(self, uuid, suffix=""): - """create external path based on the uuid hash""" - return self._make_external_filepath( - PurePosixPath( - self.database, - "/".join(subfold(uuid.hex, self.storage.spec["subfolding"])), - uuid.hex, - ).with_suffix(suffix) - ) - - def _upload_file(self, local_path, external_path, metadata=None): - """Upload a file to external storage using fsspec backend.""" - self.storage.put_file(local_path, external_path, metadata) - - def _download_file(self, external_path, download_path): - """Download a file from external storage using fsspec backend.""" - self.storage.get_file(external_path, download_path) - - def _upload_buffer(self, buffer, external_path): - """Upload bytes to external storage using fsspec backend.""" - self.storage.put_buffer(buffer, external_path) - - def _download_buffer(self, external_path): - """Download bytes from external storage using fsspec backend.""" - return 
self.storage.get_buffer(external_path) - - def _remove_external_file(self, external_path): - """Remove a file from external storage using fsspec backend.""" - self.storage.remove(external_path) - - def exists(self, external_filepath): - """ - Check if an external file is accessible using fsspec backend. - - :return: True if the external file is accessible - """ - return self.storage.exists(external_filepath) - - # --- BLOBS ---- - - def put(self, blob): - """ - put a binary string (blob) in external store - """ - uuid = uuid_from_buffer(blob) - self._upload_buffer(blob, self._make_uuid_path(uuid)) - # insert tracking info - self.connection.query( - "INSERT INTO {tab} (hash, size) VALUES (%s, {size}) ON DUPLICATE KEY UPDATE timestamp=CURRENT_TIMESTAMP".format( - tab=self.full_table_name, size=len(blob) - ), - args=(uuid.bytes,), - ) - return uuid - - def get(self, uuid): - """ - get an object from external store. - """ - if uuid is None: - return None - # attempt to get object from cache - blob = None - cache_folder = config.get("cache", None) - if cache_folder: - try: - cache_path = Path(cache_folder, *subfold(uuid.hex, CACHE_SUBFOLDING)) - cache_file = Path(cache_path, uuid.hex) - blob = cache_file.read_bytes() - except FileNotFoundError: - pass # not cached - # download blob from external store - if blob is None: - try: - blob = self._download_buffer(self._make_uuid_path(uuid)) - except MissingExternalFile: - if not SUPPORT_MIGRATED_BLOBS: - raise - # blobs migrated from datajoint 0.11 are stored at explicitly defined filepaths - relative_filepath, contents_hash = (self & {"hash": uuid}).fetch1("filepath", "contents_hash") - if relative_filepath is None: - raise - blob = self._download_buffer(self._make_external_filepath(relative_filepath)) - if cache_folder: - cache_path.mkdir(parents=True, exist_ok=True) - safe_write(cache_path / uuid.hex, blob) - return blob - - # --- ATTACHMENTS --- - - def upload_attachment(self, local_path): - attachment_name = 
Path(local_path).name - uuid = uuid_from_file(local_path, init_string=attachment_name + "\0") - external_path = self._make_uuid_path(uuid, "." + attachment_name) - self._upload_file(local_path, external_path) - # insert tracking info - self.connection.query( - """ - INSERT INTO {tab} (hash, size, attachment_name) - VALUES (%s, {size}, "{attachment_name}") - ON DUPLICATE KEY UPDATE timestamp=CURRENT_TIMESTAMP""".format( - tab=self.full_table_name, - size=Path(local_path).stat().st_size, - attachment_name=attachment_name, - ), - args=[uuid.bytes], - ) - return uuid - - def get_attachment_name(self, uuid): - return (self & {"hash": uuid}).fetch1("attachment_name") - - def download_attachment(self, uuid, attachment_name, download_path): - """save attachment from memory buffer into the save_path""" - external_path = self._make_uuid_path(uuid, "." + attachment_name) - self._download_file(external_path, download_path) - - # --- FILEPATH --- - - def upload_filepath(self, local_filepath): - """ - Raise exception if an external entry already exists with a different contents checksum. 
- Otherwise, copy (with overwrite) file to remote and - If an external entry exists with the same checksum, then no copying should occur - """ - local_filepath = Path(local_filepath) - try: - relative_filepath = str(local_filepath.relative_to(self.storage.spec["stage"]).as_posix()) - except ValueError: - raise DataJointError(f"The path {local_filepath.parent} is not in stage {self.storage.spec['stage']}") - uuid = uuid_from_buffer(init_string=relative_filepath) # hash relative path, not contents - contents_hash = uuid_from_file(local_filepath) - - # check if the remote file already exists and verify that it matches - check_hash = (self & {"hash": uuid}).fetch("contents_hash") - if check_hash.size: - # the tracking entry exists, check that it's the same file as before - if contents_hash != check_hash[0]: - raise DataJointError(f"A different version of '{relative_filepath}' has already been placed.") - else: - # upload the file and create its tracking entry - self._upload_file( - local_filepath, - self._make_external_filepath(relative_filepath), - metadata={"contents_hash": str(contents_hash)}, - ) - self.connection.query( - "INSERT INTO {tab} (hash, size, filepath, contents_hash) VALUES (%s, {size}, '{filepath}', %s)".format( - tab=self.full_table_name, - size=Path(local_filepath).stat().st_size, - filepath=relative_filepath, - ), - args=(uuid.bytes, contents_hash.bytes), - ) - return uuid - - def download_filepath(self, filepath_hash): - """ - sync a file from external store to the local stage - - :param filepath_hash: The hash (UUID) of the relative_path - :return: hash (UUID) of the contents of the downloaded file or Nones - """ - - def _need_checksum(local_filepath, expected_size): - limit = config.get("filepath_checksum_size_limit") - actual_size = Path(local_filepath).stat().st_size - if expected_size != actual_size: - # this should never happen without outside interference - raise DataJointError(f"'{local_filepath}' downloaded but size did not match.") - 
return limit is None or actual_size < limit - - if filepath_hash is not None: - relative_filepath, contents_hash, size = (self & {"hash": filepath_hash}).fetch1( - "filepath", "contents_hash", "size" - ) - external_path = self._make_external_filepath(relative_filepath) - local_filepath = Path(self.storage.spec["stage"]).absolute() / relative_filepath - - file_exists = Path(local_filepath).is_file() and ( - not _need_checksum(local_filepath, size) or uuid_from_file(local_filepath) == contents_hash - ) - - if not file_exists: - self._download_file(external_path, local_filepath) - if _need_checksum(local_filepath, size) and uuid_from_file(local_filepath) != contents_hash: - # this should never happen without outside interference - raise DataJointError(f"'{local_filepath}' downloaded but did not pass checksum.") - if not _need_checksum(local_filepath, size): - logger.warning(f"Skipped checksum for file with hash: {contents_hash}, and path: {local_filepath}") - return str(local_filepath), contents_hash - - # --- UTILITIES --- - - @property - def references(self): - """ - :return: generator of referencing table names and their referencing columns - """ - return ( - {k.lower(): v for k, v in elem.items()} - for elem in self.connection.query( - """ - SELECT concat('`', table_schema, '`.`', table_name, '`') as referencing_table, column_name - FROM information_schema.key_column_usage - WHERE referenced_table_name="{tab}" and referenced_table_schema="{db}" - """.format(tab=self.table_name, db=self.database), - as_dict=True, - ) - ) - - def fetch_external_paths(self, **fetch_kwargs): - """ - generate complete external filepaths from the query. - Each element is a tuple: (uuid, path) - - :param fetch_kwargs: keyword arguments to pass to fetch - """ - fetch_kwargs.update(as_dict=True) - paths = [] - for item in self.fetch("hash", "attachment_name", "filepath", **fetch_kwargs): - if item["attachment_name"]: - # attachments - path = self._make_uuid_path(item["hash"], "." 
+ item["attachment_name"]) - elif item["filepath"]: - # external filepaths - path = self._make_external_filepath(item["filepath"]) - else: - # blobs - path = self._make_uuid_path(item["hash"]) - paths.append((item["hash"], path)) - return paths - - def unused(self): - """ - query expression for unused hashes - - :return: self restricted to elements that are not in use by any tables in the schema - """ - return self - [ - FreeTable(self.connection, ref["referencing_table"]).proj(hash=ref["column_name"]) for ref in self.references - ] - - def used(self): - """ - query expression for used hashes - - :return: self restricted to elements that in use by tables in the schema - """ - return self & [ - FreeTable(self.connection, ref["referencing_table"]).proj(hash=ref["column_name"]) for ref in self.references - ] - - def delete( - self, - *, - delete_external_files=None, - limit=None, - display_progress=True, - errors_as_string=True, - ): - """ - - :param delete_external_files: True or False. If False, only the tracking info is removed from the external - store table but the external files remain intact. If True, then the external files themselves are deleted too. 
- :param errors_as_string: If True any errors returned when deleting from external files will be strings - :param limit: (integer) limit the number of items to delete - :param display_progress: if True, display progress as files are cleaned up - :return: if deleting external files, returns errors - """ - if delete_external_files not in (True, False): - raise DataJointError("The delete_external_files argument must be set to either True or False in delete()") - - if not delete_external_files: - self.unused().delete_quick() - else: - items = self.unused().fetch_external_paths(limit=limit) - if display_progress: - items = tqdm(items) - # delete items one by one, close to transaction-safe - error_list = [] - for uuid, external_path in items: - row = (self & {"hash": uuid}).fetch() - if row.size: - try: - (self & {"hash": uuid}).delete_quick() - except Exception: - pass # if delete failed, do not remove the external file - else: - try: - self._remove_external_file(external_path) - except Exception as error: - # adding row back into table after failed delete - self.insert1(row[0], skip_duplicates=True) - error_list.append( - ( - uuid, - external_path, - str(error) if errors_as_string else error, - ) - ) - return error_list - - -class ExternalMapping(Mapping): - """ - The external manager contains all the tables for all external stores for a given schema - :Example: - e = ExternalMapping(schema) - external_table = e[store] - """ - - def __init__(self, schema): - self.schema = schema - self._tables = {} - - def __repr__(self): - return "External file tables for schema `{schema}`:\n ".format(schema=self.schema.database) + "\n ".join( - '"{store}" {protocol}:{location}'.format(store=k, **v.spec) for k, v in self.items() - ) - - def __getitem__(self, store): - """ - Triggers the creation of an external table. - Should only be used when ready to save or read from external storage. 
- - :param store: the name of the store - :return: the ExternalTable object for the store - """ - if store not in self._tables: - self._tables[store] = ExternalTable( - connection=self.schema.connection, - store=store, - database=self.schema.database, - ) - return self._tables[store] - - def __len__(self): - return len(self._tables) - - def __iter__(self): - return iter(self._tables) diff --git a/src/datajoint/schemas.py b/src/datajoint/schemas.py index e9b83efff..0b42f0104 100644 --- a/src/datajoint/schemas.py +++ b/src/datajoint/schemas.py @@ -8,7 +8,6 @@ from .connection import conn from .errors import AccessError, DataJointError -from .external import ExternalMapping from .heading import Heading from .jobs import JobTable from .settings import config @@ -71,7 +70,6 @@ def __init__( self.create_schema = create_schema self.create_tables = create_tables self._jobs = None - self.external = ExternalMapping(self) self.add_objects = add_objects self.declare_list = [] if schema_name: diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 009d475d2..dce1e70ab 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -102,12 +102,9 @@ def declare(self, context=None): "Table class name `{name}` is invalid. Please use CamelCase. ".format(name=self.class_name) + "Classes defining tables should be formatted in strict CamelCase." 
) - sql, external_stores = declare(self.full_table_name, self.definition, context) + sql, _external_stores = declare(self.full_table_name, self.definition, context) sql = sql.format(database=self.database) try: - # declare all external tables before declaring main table - for store in external_stores: - self.connection.schemas[self.database].external[store] self.connection.query(sql) except AccessError: # skip if no create privilege @@ -126,7 +123,7 @@ def alter(self, prompt=True, context=None): context = dict(frame.f_globals, **frame.f_locals) del frame old_definition = self.describe(context=context) - sql, external_stores = alter(self.definition, old_definition, context) + sql, _external_stores = alter(self.definition, old_definition, context) if not sql: if prompt: logger.warning("Nothing to alter.") @@ -134,9 +131,6 @@ def alter(self, prompt=True, context=None): sql = "ALTER TABLE {tab}\n\t".format(tab=self.full_table_name) + ",\n\t".join(sql) if not prompt or user_choice(sql + "\n\nExecute?") == "yes": try: - # declare all external tables before declaring main table - for store in external_stores: - self.connection.schemas[self.database].external[store] self.connection.query(sql) except AccessError: # skip if no create privilege @@ -351,7 +345,7 @@ def _process_object_value(self, name: str, value, row: dict, store_name: str | N size = source_path.stat().st_size else: raise DataJointError( - f"Invalid value type for object attribute {name}. " "Expected file path, folder path, or (ext, stream) tuple." + f"Invalid value type for object attribute {name}. Expected file path, folder path, or (ext, stream) tuple." ) # Get storage spec for path building From c951ee550626c1e589a77e3b5c0e77b71bb09167 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 30 Dec 2025 17:47:17 +0000 Subject: [PATCH 42/42] Replace ClassProperty with metaclass properties Python 3.10+ doesn't have a built-in class property decorator (the @classmethod + @property chaining was deprecated in 3.11). 
The modern approach is to define properties on the metaclass, which automatically makes them work at the class level. - Move connection, table_name, full_table_name properties to TableMeta - Create PartMeta subclass with overridden properties for Part tables - Remove ClassProperty class from utils.py Co-authored-by: dimitri-yatsenko --- src/datajoint/user_tables.py | 84 ++++++++++++++++++------------------ src/datajoint/utils.py | 8 ---- 2 files changed, 43 insertions(+), 49 deletions(-) diff --git a/src/datajoint/user_tables.py b/src/datajoint/user_tables.py index d7faeb285..fa26bc9c6 100644 --- a/src/datajoint/user_tables.py +++ b/src/datajoint/user_tables.py @@ -7,7 +7,7 @@ from .autopopulate import AutoPopulate from .errors import DataJointError from .table import Table -from .utils import ClassProperty, from_camel_case +from .utils import from_camel_case _base_regexp = r"[a-z][a-z0-9]*(_[a-z][a-z0-9]*)*" @@ -78,6 +78,26 @@ def __add__(cls, arg): def __iter__(cls): return iter(cls()) + # Class properties - defined on metaclass to work at class level + @property + def connection(cls): + """The database connection for this table.""" + return cls._connection + + @property + def table_name(cls): + """The table name formatted for MySQL.""" + if cls._prefix is None: + raise AttributeError("Class prefix is not defined!") + return cls._prefix + from_camel_case(cls.__name__) + + @property + def full_table_name(cls): + """The fully qualified table name (`database`.`table`).""" + if cls.database is None: + return None + return r"`{0:s}`.`{1:s}`".format(cls.database, cls.table_name) + class UserTable(Table, metaclass=TableMeta): """ @@ -101,27 +121,6 @@ def definition(self): """ raise NotImplementedError('Subclasses of Table must implement the property "definition"') - @ClassProperty - def connection(cls): - return cls._connection - - @ClassProperty - def table_name(cls): - """ - :return: the table name of the table formatted for mysql. 
- """ - if cls._prefix is None: - raise AttributeError("Class prefix is not defined!") - return cls._prefix + from_camel_case(cls.__name__) - - @ClassProperty - def full_table_name(cls): - if cls not in {Manual, Imported, Lookup, Computed, Part, UserTable}: - # for derived classes only - if cls.database is None: - raise DataJointError("Class %s is not properly declared (schema decorator not applied?)" % cls.__name__) - return r"`{0:s}`.`{1:s}`".format(cls.database, cls.table_name) - class Manual(UserTable): """ @@ -163,7 +162,28 @@ class Computed(UserTable, AutoPopulate): tier_regexp = r"(?P" + _prefix + _base_regexp + ")" -class Part(UserTable): +class PartMeta(TableMeta): + """Metaclass for Part tables with overridden class properties.""" + + @property + def table_name(cls): + """The table name for a Part is derived from its master table.""" + return None if cls.master is None else cls.master.table_name + "__" + from_camel_case(cls.__name__) + + @property + def full_table_name(cls): + """The fully qualified table name (`database`.`table`).""" + if cls.database is None or cls.table_name is None: + return None + return r"`{0:s}`.`{1:s}`".format(cls.database, cls.table_name) + + @property + def master(cls): + """The master table for this Part table.""" + return cls._master + + +class Part(UserTable, metaclass=PartMeta): """ Inherit from this class if the table's values are details of an entry in another table and if this table is populated by the other table. 
For example, the entries inheriting from @@ -184,24 +204,6 @@ class Part(UserTable): + ")" ) - @ClassProperty - def connection(cls): - return cls._connection - - @ClassProperty - def full_table_name(cls): - return ( - None if cls.database is None or cls.table_name is None else r"`{0:s}`.`{1:s}`".format(cls.database, cls.table_name) - ) - - @ClassProperty - def master(cls): - return cls._master - - @ClassProperty - def table_name(cls): - return None if cls.master is None else cls.master.table_name + "__" + from_camel_case(cls.__name__) - def delete(self, force=False): """ unless force is True, prohibits direct deletes from parts. diff --git a/src/datajoint/utils.py b/src/datajoint/utils.py index 16927965e..e8303a993 100644 --- a/src/datajoint/utils.py +++ b/src/datajoint/utils.py @@ -7,14 +7,6 @@ from .errors import DataJointError -class ClassProperty: - def __init__(self, f): - self.f = f - - def __get__(self, obj, owner): - return self.f(owner) - - def user_choice(prompt, choices=("yes", "no"), default=None): """ Prompts the user for confirmation. The default value, if any, is capitalized.