diff --git a/docs/src/compute/key-source.md b/docs/src/compute/key-source.md index 76796ec0c..c9b5d2ce7 100644 --- a/docs/src/compute/key-source.md +++ b/docs/src/compute/key-source.md @@ -45,7 +45,7 @@ definition = """ -> Recording --- sample_rate : float -eeg_data : longblob +eeg_data : <djblob> """ key_source = Recording & 'recording_type = "EEG"' ``` diff --git a/docs/src/compute/make.md b/docs/src/compute/make.md index 1b5569b65..390be3b7b 100644 --- a/docs/src/compute/make.md +++ b/docs/src/compute/make.md @@ -152,7 +152,7 @@ class ImageAnalysis(dj.Computed): # Complex image analysis results -> Image --- - analysis_result : longblob + analysis_result : <djblob> processing_time : float """ @@ -188,7 +188,7 @@ class ImageAnalysis(dj.Computed): # Complex image analysis results -> Image --- - analysis_result : longblob + analysis_result : <djblob> processing_time : float """ diff --git a/docs/src/compute/populate.md b/docs/src/compute/populate.md index 45c863f17..91db7b176 100644 --- a/docs/src/compute/populate.md +++ b/docs/src/compute/populate.md @@ -40,7 +40,7 @@ class FilteredImage(dj.Computed): # Filtered image -> Image --- - filtered_image : longblob + filtered_image : <djblob> """ def make(self, key): @@ -196,7 +196,7 @@ class ImageAnalysis(dj.Computed): # Complex image analysis results -> Image --- - analysis_result : longblob + analysis_result : <djblob> processing_time : float """ @@ -230,7 +230,7 @@ class ImageAnalysis(dj.Computed): # Complex image analysis results -> Image --- - analysis_result : longblob + analysis_result : <djblob> processing_time : float """ diff --git a/docs/src/design/integrity.md b/docs/src/design/integrity.md index cb7122755..393103522 100644 --- a/docs/src/design/integrity.md +++ b/docs/src/design/integrity.md @@ -142,7 +142,7 @@ definition = """ -> EEGRecording channel_idx : int --- -channel_data : longblob +channel_data : <djblob> """ ``` ![doc_1-many](../images/doc_1-many.png){: style="align:center"} diff --git a/docs/src/design/tables/attributes.md 
b/docs/src/design/tables/attributes.md index f3877cec9..2e8105e7c 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -48,9 +48,10 @@ fractional digits. Because of its well-defined precision, `decimal` values can be used in equality comparison and be included in primary keys. -- `longblob`: arbitrary numeric array (e.g. matrix, image, structure), up to 4 +- `longblob`: raw binary data, up to 4 [GiB](http://en.wikipedia.org/wiki/Gibibyte) in size. - Numeric arrays are compatible between MATLAB and Python (NumPy). + Stores and returns raw bytes without serialization. + For serialized Python objects (arrays, dicts, etc.), use `<djblob>` instead. The `longblob` and other `blob` datatypes can be configured to store data [externally](../../sysadmin/external-store.md) by using the `blob@store` syntax. @@ -71,6 +72,10 @@ info). These types abstract certain kinds of non-database data to facilitate use together with DataJoint. +- `<djblob>`: DataJoint's native serialization format for Python objects. Supports +NumPy arrays, dicts, lists, datetime objects, and nested structures. Compatible with +MATLAB. See [custom types](customtype.md) for details. + - `object`: managed [file and folder storage](object.md) with support for direct writes (Zarr, HDF5) and fsspec integration. Recommended for new pipelines. @@ -80,6 +85,10 @@ sending/receiving an opaque data file to/from a DataJoint pipeline. - `filepath@store`: a [filepath](filepath.md) used to link non-DataJoint managed files into a DataJoint pipeline. +- `<type_name>`: a [custom attribute type](customtype.md) that defines bidirectional +conversion between Python objects and database storage formats. Use this to store +complex data types like graphs, domain-specific objects, or custom data structures. + ## Numeric type aliases DataJoint provides convenient type aliases that map to standard MySQL numeric types. 
diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md index aad194ff5..267e0420b 100644 --- a/docs/src/design/tables/customtype.md +++ b/docs/src/design/tables/customtype.md @@ -1,4 +1,4 @@ -# Custom Types +# Custom Attribute Types In modern scientific research, data pipelines often involve complex workflows that generate diverse data types. From high-dimensional imaging data to machine learning @@ -12,69 +12,603 @@ traditional relational databases. For example: + Computational biologists might store fitted machine learning models or parameter objects for downstream predictions. -To handle these diverse needs, DataJoint provides the `dj.AttributeAdapter` method. It +To handle these diverse needs, DataJoint provides the **AttributeType** system. It enables researchers to store and retrieve complex, non-standard data types—like Python objects or data structures—in a relational database while maintaining the reproducibility, modularity, and query capabilities required for scientific workflows. -## Uses in Scientific Research +## Overview -Imagine a neuroscience lab studying neural connectivity. Researchers might generate -graphs (e.g., networkx.Graph) to represent connections between brain regions, where: +Custom attribute types define bidirectional conversion between: -+ Nodes are brain regions. -+ Edges represent connections weighted by signal strength or another metric. +- **Python objects** (what your code works with) +- **Storage format** (what gets stored in the database) -Storing these graph objects in a database alongside other experimental data (e.g., -subject metadata, imaging parameters) ensures: +``` +┌─────────────────┐ encode() ┌─────────────────┐ +│ Python Object │ ───────────────► │ Storage Type │ +│ (e.g. Graph) │ │ (e.g. 
blob) │ +└─────────────────┘ decode() └─────────────────┘ + ◄─────────────── +``` + +## Defining Custom Types + +Create a custom type by subclassing `dj.AttributeType` and implementing the required +methods: + +```python +import datajoint as dj +import networkx as nx + +@dj.register_type +class GraphType(dj.AttributeType): + """Custom type for storing networkx graphs.""" + + # Required: unique identifier used in table definitions + type_name = "graph" + + # Required: underlying DataJoint storage type + dtype = "longblob" + + def encode(self, graph, *, key=None): + """Convert graph to storable format (called on INSERT).""" + return list(graph.edges) + + def decode(self, edges, *, key=None): + """Convert stored data back to graph (called on FETCH).""" + return nx.Graph(edges) +``` + +### Required Components + +| Component | Description | +|-----------|-------------| +| `type_name` | Unique identifier used in table definitions with `` syntax | +| `dtype` | Underlying DataJoint type for storage (e.g., `"longblob"`, `"varchar(255)"`, `"json"`) | +| `encode(value, *, key=None)` | Converts Python object to storable format | +| `decode(stored, *, key=None)` | Converts stored data back to Python object | + +### Using Custom Types in Tables + +Once registered, use the type in table definitions with angle brackets: + +```python +@schema +class Connectivity(dj.Manual): + definition = """ + conn_id : int + --- + conn_graph = null : # Uses the GraphType we defined + """ +``` + +Insert and fetch work seamlessly: + +```python +import networkx as nx + +# Insert - encode() is called automatically +g = nx.lollipop_graph(4, 2) +Connectivity.insert1({"conn_id": 1, "conn_graph": g}) + +# Fetch - decode() is called automatically +result = (Connectivity & "conn_id = 1").fetch1("conn_graph") +assert isinstance(result, nx.Graph) +``` + +## Type Registration + +### Decorator Registration + +The simplest way to register a type is with the `@dj.register_type` decorator: + +```python 
+@dj.register_type +class MyType(dj.AttributeType): + type_name = "my_type" + ... +``` + +### Direct Registration + +You can also register types explicitly: + +```python +class MyType(dj.AttributeType): + type_name = "my_type" + ... + +dj.register_type(MyType) +``` + +### Listing Registered Types + +```python +# List all registered type names +print(dj.list_types()) +``` + +## Validation + +Add data validation by overriding the `validate()` method. It's called automatically +before `encode()` during INSERT operations: + +```python +@dj.register_type +class PositiveArrayType(dj.AttributeType): + type_name = "positive_array" + dtype = "longblob" + + def validate(self, value): + """Ensure all values are positive.""" + import numpy as np + if not isinstance(value, np.ndarray): + raise TypeError(f"Expected numpy array, got {type(value).__name__}") + if np.any(value < 0): + raise ValueError("Array must contain only positive values") + + def encode(self, array, *, key=None): + return array -1. Centralized Data Management: All experimental data and analysis results are stored - together for easy access and querying. -2. Reproducibility: The exact graph objects used in analysis can be retrieved later for - validation or further exploration. -3. Scalability: Graph data can be integrated into workflows for larger datasets or - across experiments. 
+ def decode(self, stored, *, key=None): + return stored +``` + +## Storage Types (dtype) + +The `dtype` property specifies how data is stored in the database: + +| dtype | Use Case | Stored Format | +|-------|----------|---------------| +| `"longblob"` | Complex Python objects, arrays | Serialized binary | +| `"blob"` | Smaller objects | Serialized binary | +| `"json"` | JSON-serializable data | JSON string | +| `"varchar(N)"` | String representations | Text | +| `"int"` | Integer identifiers | Integer | +| `"blob@store"` | Large objects in external storage | UUID reference | +| `"object"` | Files/folders in object storage | JSON metadata | +| `"<type_name>"` | Chain to another custom type | Varies | + +### External Storage + +For large data, use external blob storage: + +```python +@dj.register_type +class LargeArrayType(dj.AttributeType): + type_name = "large_array" + dtype = "blob@mystore" # Uses external store named "mystore" + + def encode(self, array, *, key=None): + return array + + def decode(self, stored, *, key=None): + return stored +``` + +## Type Chaining + +Custom types can build on other custom types by referencing them in `dtype`: + +```python +@dj.register_type +class CompressedGraphType(dj.AttributeType): + type_name = "compressed_graph" + dtype = "<graph>" # Chain to the GraphType + + def encode(self, graph, *, key=None): + # Compress before passing to GraphType + return self._compress(graph) + + def decode(self, stored, *, key=None): + # GraphType's decode already ran + return self._decompress(stored) +``` + +DataJoint automatically resolves the chain to find the final storage type. + +## The Key Parameter + +The `key` parameter provides access to primary key values during encode/decode +operations. 
This is useful when the conversion depends on record context: + +```python +@dj.register_type +class ContextAwareType(dj.AttributeType): + type_name = "context_aware" + dtype = "longblob" + + def encode(self, value, *, key=None): + if key and key.get("version") == 2: + return self._encode_v2(value) + return self._encode_v1(value) + + def decode(self, stored, *, key=None): + if key and key.get("version") == 2: + return self._decode_v2(stored) + return self._decode_v1(stored) +``` + +## Publishing Custom Types as Packages -However, since graphs are not natively supported by relational databases, here’s where -`dj.AttributeAdapter` becomes essential. It allows researchers to define custom logic for -serializing graphs (e.g., as edge lists) and deserializing them back into Python -objects, bridging the gap between advanced data types and the database. +Custom types can be distributed as installable packages using Python entry points. +This allows types to be automatically discovered when the package is installed. 
-### Example: Storing Graphs in DataJoint +### Package Structure -To store a networkx.Graph object in a DataJoint table, researchers can define a custom -attribute type in a datajoint table class: +``` +dj-graph-types/ +├── pyproject.toml +└── src/ + └── dj_graph_types/ + ├── __init__.py + └── types.py +``` + +### pyproject.toml + +```toml +[project] +name = "dj-graph-types" +version = "1.0.0" + +[project.entry-points."datajoint.types"] +graph = "dj_graph_types.types:GraphType" +weighted_graph = "dj_graph_types.types:WeightedGraphType" +``` + +### Type Implementation ```python +# src/dj_graph_types/types.py import datajoint as dj +import networkx as nx -class GraphAdapter(dj.AttributeAdapter): +class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph, *, key=None): + return list(graph.edges) + + def decode(self, edges, *, key=None): + return nx.Graph(edges) + +class WeightedGraphType(dj.AttributeType): + type_name = "weighted_graph" + dtype = "longblob" + + def encode(self, graph, *, key=None): + return [(u, v, d) for u, v, d in graph.edges(data=True)] + + def decode(self, edges, *, key=None): + g = nx.Graph() + g.add_weighted_edges_from(edges) + return g +``` + +### Usage After Installation + +```bash +pip install dj-graph-types +``` + +```python +# Types are automatically available after package installation +@schema +class MyTable(dj.Manual): + definition = """ + id : int + --- + network : + weighted_network : + """ +``` + +## Complete Example + +Here's a complete example demonstrating custom types for a neuroscience workflow: + +```python +import datajoint as dj +import numpy as np + +# Configure DataJoint +dj.config["database.host"] = "localhost" +dj.config["database.user"] = "root" +dj.config["database.password"] = "password" + +# Define custom types +@dj.register_type +class SpikeTrainType(dj.AttributeType): + """Efficient storage for sparse spike timing data.""" + type_name = "spike_train" + dtype = "longblob" 
+ + def validate(self, value): + if not isinstance(value, np.ndarray): + raise TypeError("Expected numpy array of spike times") + if value.ndim != 1: + raise ValueError("Spike train must be 1-dimensional") + if not np.all(np.diff(value) >= 0): + raise ValueError("Spike times must be sorted") + + def encode(self, spike_times, *, key=None): + # Store as differences (smaller values, better compression) + return np.diff(spike_times, prepend=0).astype(np.float32) + + def decode(self, stored, *, key=None): + # Reconstruct original spike times + return np.cumsum(stored).astype(np.float64) - attribute_type = 'longblob' # this is how the attribute will be declared + +@dj.register_type +class WaveformType(dj.AttributeType): + """Storage for spike waveform templates with metadata.""" + type_name = "waveform" + dtype = "longblob" + + def encode(self, waveform_dict, *, key=None): + return { + "data": waveform_dict["data"].astype(np.float32), + "sampling_rate": waveform_dict["sampling_rate"], + "channel_ids": list(waveform_dict["channel_ids"]), + } + + def decode(self, stored, *, key=None): + return { + "data": stored["data"].astype(np.float64), + "sampling_rate": stored["sampling_rate"], + "channel_ids": np.array(stored["channel_ids"]), + } + + +# Create schema and tables +schema = dj.schema("ephys_analysis") + +@schema +class Unit(dj.Manual): + definition = """ + unit_id : int + --- + spike_times : + waveform : + quality : enum('good', 'mua', 'noise') + """ + + +# Usage +spike_times = np.array([0.1, 0.15, 0.23, 0.45, 0.67, 0.89]) +waveform = { + "data": np.random.randn(82, 4), + "sampling_rate": 30000, + "channel_ids": [10, 11, 12, 13], +} + +Unit.insert1({ + "unit_id": 1, + "spike_times": spike_times, + "waveform": waveform, + "quality": "good", +}) + +# Fetch - automatically decoded +result = (Unit & "unit_id = 1").fetch1() +print(f"Spike times: {result['spike_times']}") +print(f"Waveform shape: {result['waveform']['data'].shape}") +``` + +## Migration from AttributeAdapter 
+ +The `AttributeAdapter` class is deprecated. Migrate to `AttributeType`: + +### Before (deprecated) + +```python +class GraphAdapter(dj.AttributeAdapter): + attribute_type = "longblob" def put(self, obj): - # convert the nx.Graph object into an edge list - assert isinstance(obj, nx.Graph) return list(obj.edges) def get(self, value): - # convert edge list back into an nx.Graph return nx.Graph(value) - -# instantiate for use as a datajoint type +# Required context-based registration graph = GraphAdapter() +schema = dj.schema("mydb", context={"graph": graph}) +``` + +### After (recommended) + +```python +@dj.register_type +class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, obj, *, key=None): + return list(obj.edges) + + def decode(self, value, *, key=None): + return nx.Graph(value) + +# Global registration - no context needed +schema = dj.schema("mydb") +``` + +### Key Differences + +| Aspect | AttributeAdapter (deprecated) | AttributeType (recommended) | +|--------|-------------------------------|----------------------------| +| Methods | `put()` / `get()` | `encode()` / `decode()` | +| Storage type | `attribute_type` | `dtype` | +| Type name | Variable name in context | `type_name` property | +| Registration | Context dict per schema | Global `@register_type` decorator | +| Validation | Manual | Built-in `validate()` method | +| Distribution | Copy adapter code | Entry point packages | +| Key access | Not available | Optional `key` parameter | + +## Best Practices + +1. **Choose descriptive type names**: Use lowercase with underscores (e.g., `spike_train`, `graph_embedding`) + +2. **Select appropriate storage types**: Use `` for complex objects, `json` for simple structures, external storage for large data + +3. **Add validation**: Use `validate()` to catch data errors early + +4. **Document your types**: Include docstrings explaining the expected input/output formats + +5. 
**Handle None values**: Your encode/decode methods may receive `None` for nullable attributes + +6. **Consider versioning**: If your encoding format might change, include version information + +7. **Test round-trips**: Ensure `decode(encode(x)) == x` for all valid inputs + +```python +def test_graph_type_roundtrip(): + g = nx.lollipop_graph(4, 2) + t = GraphType() + + encoded = t.encode(g) + decoded = t.decode(encoded) + + assert set(g.edges) == set(decoded.edges) +``` +## Built-in Types -# define a table with a graph attribute -schema = dj.schema('test_graphs') +DataJoint includes a built-in type for explicit blob serialization: +### `<djblob>` - DataJoint Blob Serialization +The `<djblob>` type provides explicit control over DataJoint's native binary +serialization. It supports: + +- NumPy arrays (compatible with MATLAB) +- Python dicts, lists, tuples, sets +- datetime objects, Decimals, UUIDs +- Nested data structures +- Optional compression + +```python @schema -class Connectivity(dj.Manual): +class ProcessedData(dj.Manual): definition = """ - conn_id : int + data_id : int + --- + results : <djblob> # Serialized Python objects + raw_bytes : longblob # Raw bytes (no serialization) + """ +``` + +#### When to Use `<djblob>` + +- **Serialized data**: When storing Python objects (dicts, arrays, etc.) 
+- **New tables**: Prefer `` for automatic serialization +- **Migration**: Existing schemas with implicit serialization must migrate + +#### Raw Blob Behavior + +Plain `longblob` (and other blob variants) columns now store and return +**raw bytes** without automatic serialization: + +```python +@schema +class RawData(dj.Manual): + definition = """ + id : int + --- + raw_bytes : longblob # Stores/returns raw bytes + serialized : # Stores Python objects with serialization + """ + +# Raw bytes - no serialization +RawData.insert1({"id": 1, "raw_bytes": b"raw binary data", "serialized": {"key": "value"}}) + +row = (RawData & "id=1").fetch1() +row["raw_bytes"] # Returns: b"raw binary data" +row["serialized"] # Returns: {"key": "value"} +``` + +**Important**: Existing schemas that relied on implicit blob serialization +must be migrated to `` to preserve their behavior. + +## Schema Migration + +When upgrading existing schemas to use explicit type declarations, DataJoint +provides migration utilities. + +### Analyzing Blob Columns + +```python +import datajoint as dj + +schema = dj.schema("my_database") + +# Check migration status +status = dj.migrate.check_migration_status(schema) +print(f"Blob columns: {status['total_blob_columns']}") +print(f"Already migrated: {status['migrated']}") +print(f"Pending migration: {status['pending']}") +``` + +### Generating Migration SQL + +```python +# Preview migration (dry run) +result = dj.migrate.migrate_blob_columns(schema, dry_run=True) +for sql in result['sql_statements']: + print(sql) +``` + +### Applying Migration + +```python +# Apply migration +result = dj.migrate.migrate_blob_columns(schema, dry_run=False) +print(f"Migrated {result['migrated']} columns") +``` + +### Migration Details + +The migration updates MySQL column comments to include the type declaration. +This is a **metadata-only** change - the actual blob data format is unchanged. + +All blob type variants are handled: `tinyblob`, `blob`, `mediumblob`, `longblob`. 
+ +Before migration: +- Column: `longblob` (or `blob`, `mediumblob`, etc.) +- Comment: `user comment` +- Behavior: Auto-serialization (implicit) + +After migration: +- Column: `longblob` (unchanged) +- Comment: `::user comment` +- Behavior: Explicit serialization via `` + +### Updating Table Definitions + +After database migration, update your Python table definitions for consistency: + +```python +# Before +class MyTable(dj.Manual): + definition = """ + id : int + --- + data : longblob # stored data + """ + +# After +class MyTable(dj.Manual): + definition = """ + id : int --- - conn_graph = null : # a networkx.Graph object + data : # stored data """ ``` + +Both definitions work identically after migration, but using `` makes +the serialization explicit and documents the intended behavior. diff --git a/docs/src/design/tables/master-part.md b/docs/src/design/tables/master-part.md index 629bfb8ab..d0f575e4d 100644 --- a/docs/src/design/tables/master-part.md +++ b/docs/src/design/tables/master-part.md @@ -26,8 +26,8 @@ class Segmentation(dj.Computed): -> Segmentation roi : smallint # roi number --- - roi_pixels : longblob # indices of pixels - roi_weights : longblob # weights of pixels + roi_pixels : # indices of pixels + roi_weights : # weights of pixels """ def make(self, key): @@ -101,7 +101,7 @@ definition = """ -> ElectrodeResponse channel: int --- -response: longblob # response of a channel +response: # response of a channel """ ``` diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md new file mode 100644 index 000000000..c15a2292c --- /dev/null +++ b/docs/src/design/tables/storage-types-implementation-plan.md @@ -0,0 +1,464 @@ +# DataJoint Storage Types Redesign - Implementation Plan + +## Executive Summary + +This plan describes the implementation of a three-layer type architecture for DataJoint, building on the existing `AttributeType` infrastructure. The key goals are: + +1. 
Establish a clean three-layer type hierarchy (native DB types, core DataJoint types, AttributeTypes) +2. Implement content-addressed storage with deduplication +3. Provide composable, user-friendly types (``, ``, ``) +4. Enable project-wide garbage collection +5. Maintain backward compatibility with existing schemas + +--- + +## Implementation Status + +| Phase | Status | Notes | +|-------|--------|-------| +| Phase 1: Core Type System | ✅ Complete | CORE_TYPES dict, type chain resolution | +| Phase 2: Content-Addressed Storage | ✅ Complete | Function-based, no registry table | +| Phase 2b: Path-Addressed Storage | ✅ Complete | ObjectType for files/folders | +| Phase 3: User-Defined AttributeTypes | ✅ Complete | AttachType, XAttachType, FilepathType | +| Phase 4: Insert and Fetch Integration | ✅ Complete | Type chain encoding/decoding | +| Phase 5: Garbage Collection | ✅ Complete | gc.py with scan/collect functions | +| Phase 6: Documentation and Testing | ✅ Complete | Test files for all new types | + +--- + +## Phase 1: Core Type System Foundation ✅ + +**Status**: Complete + +### Implemented in `src/datajoint/declare.py`: + +```python +CORE_TYPES = { + # Numeric types (aliased to native SQL) + "float32": (r"float32$", "float"), + "float64": (r"float64$", "double"), + "int64": (r"int64$", "bigint"), + "uint64": (r"uint64$", "bigint unsigned"), + "int32": (r"int32$", "int"), + "uint32": (r"uint32$", "int unsigned"), + "int16": (r"int16$", "smallint"), + "uint16": (r"uint16$", "smallint unsigned"), + "int8": (r"int8$", "tinyint"), + "uint8": (r"uint8$", "tinyint unsigned"), + "bool": (r"bool$", "tinyint"), + # UUID (stored as binary) + "uuid": (r"uuid$", "binary(16)"), + # JSON + "json": (r"json$", None), + # Binary (blob maps to longblob) + "blob": (r"blob$", "longblob"), + # Temporal + "date": (r"date$", None), + "datetime": (r"datetime$", None), + # String types (with parameters) + "char": (r"char\s*\(\d+\)$", None), + "varchar": (r"varchar\s*\(\d+\)$", None), + # 
Enumeration + "enum": (r"enum\s*\(.+\)$", None), +} +``` + +### Key changes: +- Removed `SERIALIZED_TYPES`, `BINARY_TYPES`, `EXTERNAL_TYPES` +- Core types are recorded in field comments with `:type:` syntax +- Non-standard native types pass through with warning +- `parse_type_spec()` handles `` syntax +- `resolve_dtype()` returns `(final_dtype, type_chain, store_name)` tuple + +--- + +## Phase 2: Content-Addressed Storage ✅ + +**Status**: Complete (simplified design) + +### Design Decision: Functions vs Class + +The original plan proposed a `ContentRegistry` class with a database table. We implemented a simpler, stateless approach using functions in `content_registry.py`: + +**Why functions instead of a registry table:** +1. **Simpler** - No additional database table to manage +2. **Decoupled** - Content storage is independent of any schema +3. **GC by scanning** - Garbage collection scans tables for references rather than maintaining reference counts +4. **Less state** - No synchronization issues between registry and actual storage + +### Implemented in `src/datajoint/content_registry.py`: + +```python +def compute_content_hash(data: bytes) -> str: + """Compute SHA256 hash of content.""" + return hashlib.sha256(data).hexdigest() + +def build_content_path(content_hash: str) -> str: + """Build path: _content/{hash[:2]}/{hash[2:4]}/{hash}""" + return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + +def put_content(data: bytes, store_name: str | None = None) -> dict[str, Any]: + """Store content with deduplication. Returns {hash, store, size}.""" + ... + +def get_content(content_hash: str, store_name: str | None = None) -> bytes: + """Retrieve content by hash with verification.""" + ... + +def content_exists(content_hash: str, store_name: str | None = None) -> bool: + """Check if content exists.""" + ... 
+ +def delete_content(content_hash: str, store_name: str | None = None) -> bool: + """Delete content (use with caution - verify no references first).""" + ... +``` + +### Implemented AttributeTypes in `src/datajoint/attribute_type.py`: + +```python +class ContentType(AttributeType): + """Content-addressed storage. Stores bytes, returns JSON metadata.""" + type_name = "content" + dtype = "json" + + def encode(self, value: bytes, *, key=None, store_name=None) -> dict: + return put_content(value, store_name=store_name) + + def decode(self, stored: dict, *, key=None) -> bytes: + return get_content(stored["hash"], store_name=stored.get("store")) + + +class XBlobType(AttributeType): + """External serialized blob using content-addressed storage.""" + type_name = "xblob" + dtype = "" # Composition + + def encode(self, value, *, key=None, store_name=None) -> bytes: + return blob.pack(value, compress=True) + + def decode(self, stored: bytes, *, key=None) -> Any: + return blob.unpack(stored, squeeze=False) +``` + +--- + +## Phase 2b: Path-Addressed Storage (ObjectType) ✅ + +**Status**: Complete + +### Design: Path vs Content Addressing + +| Aspect | `` | `` | +|--------|-------------|------------| +| Addressing | Content-hash (SHA256) | Path (from primary key) | +| Path Format | `_content/{hash[:2]}/{hash[2:4]}/{hash}` | `{schema}/{table}/objects/{pk}/{field}_{token}.ext` | +| Deduplication | Yes (same content = same hash) | No (each row has unique path) | +| Deletion | GC when unreferenced | Deleted with row | +| Use case | Serialized blobs, attachments | Zarr, HDF5, folders | + +### Implemented in `src/datajoint/builtin_types.py`: + +```python +@register_type +class ObjectType(AttributeType): + """Path-addressed storage for files and folders.""" + type_name = "object" + dtype = "json" + + def encode(self, value, *, key=None, store_name=None) -> dict: + # value can be bytes, str path, or Path + # key contains _schema, _table, _field for path construction + path, token = 
build_object_path(schema, table, field, primary_key, ext) + backend.put_buffer(content, path) # or put_folder for directories + return { + "path": path, + "store": store_name, + "size": size, + "ext": ext, + "is_dir": is_dir, + "timestamp": timestamp.isoformat(), + } + + def decode(self, stored: dict, *, key=None) -> ObjectRef: + # Returns lazy handle for fsspec-based access + return ObjectRef.from_json(stored, backend=backend) +``` + +### ObjectRef Features: +- `ref.path` - Storage path +- `ref.read()` - Read file content +- `ref.open()` - Open as file handle +- `ref.fsmap` - For `zarr.open(ref.fsmap)` +- `ref.download(dest)` - Download to local path +- `ref.listdir()` / `ref.walk()` - For directories + +### Staged Insert for Object Types + +For large objects like Zarr arrays, `staged_insert.py` provides direct writes to storage: + +```python +with table.staged_insert1 as staged: + # 1. Set primary key first (required for path construction) + staged.rec['subject_id'] = 123 + staged.rec['session_id'] = 45 + + # 2. Get storage handle and write directly + z = zarr.open(staged.store('raw_data', '.zarr'), mode='w') + z[:] = large_array + + # 3. On exit: metadata computed, record inserted +``` + +**Flow comparison:** + +| Normal Insert | Staged Insert | +|--------------|---------------| +| `ObjectType.encode()` uploads content | Direct writes via `staged.store()` | +| Single operation | Two-phase: write then finalize | +| Good for files/folders | Ideal for Zarr, HDF5, streaming | + +Both produce the same JSON metadata format compatible with `ObjectRef.from_json()`. + +**Key methods:** +- `staged.store(field, ext)` - Returns `FSMap` for Zarr/xarray +- `staged.open(field, ext)` - Returns file handle for binary writes +- `staged.fs` - Raw fsspec filesystem access + +--- + +## Phase 3: User-Defined AttributeTypes ✅ + +**Status**: Complete + +All built-in AttributeTypes are implemented in `src/datajoint/builtin_types.py`. 
+ +### 3.1 XBlobType ✅ +External serialized blobs using content-addressed storage. Composes with ``. + +### 3.2 AttachType ✅ + +```python +@register_type +class AttachType(AttributeType): + """Internal file attachment stored in database.""" + type_name = "attach" + dtype = "longblob" + + def encode(self, filepath, *, key=None, store_name=None) -> bytes: + # Returns: filename (UTF-8) + null byte + contents + return path.name.encode("utf-8") + b"\x00" + path.read_bytes() + + def decode(self, stored, *, key=None) -> str: + # Extracts to download_path, returns local path + ... +``` + +### 3.3 XAttachType ✅ + +```python +@register_type +class XAttachType(AttributeType): + """External file attachment using content-addressed storage.""" + type_name = "xattach" + dtype = "" # Composes with ContentType + # Same encode/decode as AttachType, but stored externally with dedup +``` + +### 3.4 FilepathType ✅ + +```python +@register_type +class FilepathType(AttributeType): + """Reference to existing file in configured store.""" + type_name = "filepath" + dtype = "json" + + def encode(self, relative_path: str, *, key=None, store_name=None) -> dict: + # Verifies file exists, returns metadata + return {'path': path, 'store': store_name, 'size': size, ...} + + def decode(self, stored: dict, *, key=None) -> ObjectRef: + # Returns ObjectRef for lazy access + return ObjectRef.from_json(stored, backend=backend) +``` + +### Type Comparison + +| Type | Storage | Copies File | Dedup | Returns | +|------|---------|-------------|-------|---------| +| `` | Database | Yes | No | Local path | +| `` | External | Yes | Yes | Local path | +| `` | Reference | No | N/A | ObjectRef | +| `` | External | Yes | No | ObjectRef | + +--- + +## Phase 4: Insert and Fetch Integration ✅ + +**Status**: Complete + +### Updated in `src/datajoint/table.py`: + +```python +def __make_placeholder(self, name, value, ...): + if attr.adapter: + from .attribute_type import resolve_dtype + attr.adapter.validate(value) + _, 
type_chain, resolved_store = resolve_dtype( + f"<{attr.adapter.type_name}>", store_name=attr.store + ) + # Apply type chain: outermost → innermost + for attr_type in type_chain: + try: + value = attr_type.encode(value, key=None, store_name=resolved_store) + except TypeError: + value = attr_type.encode(value, key=None) +``` + +### Updated in `src/datajoint/fetch.py`: + +```python +def _get(connection, attr, data, squeeze, download_path): + if attr.adapter: + from .attribute_type import resolve_dtype + final_dtype, type_chain, _ = resolve_dtype(f"<{attr.adapter.type_name}>") + + # Parse JSON if final storage is JSON + if final_dtype.lower() == "json": + data = json.loads(data) + + # Apply type chain in reverse: innermost → outermost + for attr_type in reversed(type_chain): + data = attr_type.decode(data, key=None) + + return data +``` + +--- + +## Phase 5: Garbage Collection ✅ + +**Status**: Complete + +### Implemented in `src/datajoint/gc.py`: + +```python +import datajoint as dj + +# Scan schemas and find orphaned content/objects +stats = dj.gc.scan(schema1, schema2, store_name='mystore') + +# Remove orphaned content/objects (dry_run=False to actually delete) +stats = dj.gc.collect(schema1, schema2, store_name='mystore', dry_run=True) + +# Format statistics for display +print(dj.gc.format_stats(stats)) +``` + +**Supported storage patterns:** + +1. **Content-Addressed Storage** (``, ``, ``): + - Stored at: `_content/{hash[:2]}/{hash[2:4]}/{hash}` + - Referenced by SHA256 hash in JSON metadata + +2. 
**Path-Addressed Storage** (`<object>`): + - Stored at: `{schema}/{table}/objects/{pk}/{field}_{token}/` + - Referenced by path in JSON metadata + +**Key functions:** +- `scan_references(*schemas, store_name=None)` - Scan tables for content hashes +- `scan_object_references(*schemas, store_name=None)` - Scan tables for object paths +- `list_stored_content(store_name=None)` - List all content in `_content/` directory +- `list_stored_objects(store_name=None)` - List all objects in `*/objects/` directories +- `scan(*schemas, store_name=None)` - Find orphaned content/objects without deleting +- `collect(*schemas, store_name=None, dry_run=True)` - Remove orphaned content/objects +- `delete_object(path, store_name=None)` - Delete an object directory +- `format_stats(stats)` - Human-readable statistics output + +**GC Process:** +1. Scan all tables in provided schemas for content-type and object-type attributes +2. Extract content hashes and object paths from JSON metadata columns +3. Scan storage for all stored content (`_content/`) and objects (`*/objects/`) +4. Compute orphaned = stored - referenced (for both types) +5. 
Optionally delete orphaned items (when `dry_run=False`) + +--- + +## Phase 6: Documentation and Testing ✅ + +**Status**: Complete + +### Test files created: +- `tests/test_content_storage.py` - Content-addressed storage functions +- `tests/test_type_composition.py` - Type chain encoding/decoding +- `tests/test_gc.py` - Garbage collection +- `tests/test_attribute_type.py` - AttributeType registry and DJBlobType (existing) + +--- + +## Critical Files Summary + +| File | Status | Changes | +|------|--------|---------| +| `src/datajoint/declare.py` | ✅ | CORE_TYPES, type parsing, SQL generation | +| `src/datajoint/heading.py` | ✅ | Simplified attribute properties | +| `src/datajoint/attribute_type.py` | ✅ | Base class, registry, type chain resolution | +| `src/datajoint/builtin_types.py` | ✅ | DJBlobType, ContentType, XBlobType, ObjectType | +| `src/datajoint/content_registry.py` | ✅ | Content storage functions (put, get, delete) | +| `src/datajoint/objectref.py` | ✅ | ObjectRef handle for lazy access | +| `src/datajoint/storage.py` | ✅ | StorageBackend, build_object_path | +| `src/datajoint/staged_insert.py` | ✅ | Staged insert for direct object storage writes | +| `src/datajoint/table.py` | ✅ | Type chain encoding on insert | +| `src/datajoint/fetch.py` | ✅ | Type chain decoding on fetch | +| `src/datajoint/blob.py` | ✅ | Removed bypass_serialization | +| `src/datajoint/gc.py` | ✅ | Garbage collection for content storage | +| `tests/test_content_storage.py` | ✅ | Tests for content_registry.py | +| `tests/test_type_composition.py` | ✅ | Tests for type chain encoding/decoding | +| `tests/test_gc.py` | ✅ | Tests for garbage collection | + +--- + +## Removed/Deprecated + +- `src/datajoint/attribute_adapter.py` - Deleted (hard deprecated) +- `bypass_serialization` flag in `blob.py` - Removed +- `database` field in Attribute - Removed (unused) +- `SERIALIZED_TYPES`, `BINARY_TYPES`, `EXTERNAL_TYPES` - Removed +- `is_attachment`, `is_filepath`, `is_object`, `is_external` 
flags - Removed + +--- + +## Architecture Summary + +``` +Layer 3: AttributeTypes (user-facing) + , , , , , , + ↓ encode() / ↑ decode() + +Layer 2: Core DataJoint Types + float32, int64, uuid, json, blob, varchar(n), etc. + ↓ SQL mapping + +Layer 1: Native Database Types + FLOAT, BIGINT, BINARY(16), JSON, LONGBLOB, VARCHAR(n), etc. +``` + +**Built-in AttributeTypes:** +``` + → longblob (internal serialized storage) + → longblob (internal file attachment) + → json (path-addressed, for Zarr/HDF5/folders) + → json (reference to existing file in store) + → json (content-addressed with deduplication) + → json (external serialized with dedup) + → json (external file attachment with dedup) +``` + +**Type Composition Example:** +``` + → json (in DB) + +Insert: Python object → blob.pack() → put_content() → JSON metadata +Fetch: JSON metadata → get_content() → blob.unpack() → Python object +``` diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md new file mode 100644 index 000000000..668fdfdf5 --- /dev/null +++ b/docs/src/design/tables/storage-types-spec.md @@ -0,0 +1,665 @@ +# Storage Types Redesign Spec + +## Overview + +This document defines a three-layer type architecture: + +1. **Native database types** - Backend-specific (`FLOAT`, `TINYINT UNSIGNED`, `LONGBLOB`). Discouraged for direct use. +2. **Core DataJoint types** - Standardized across backends, scientist-friendly (`float32`, `uint8`, `bool`, `json`). +3. **AttributeTypes** - Programmatic types with `encode()`/`decode()` semantics. Composable. + +``` +┌───────────────────────────────────────────────────────────────────┐ +│ AttributeTypes (Layer 3) │ +│ │ +│ Built-in: │ +│ User: ... │ +├───────────────────────────────────────────────────────────────────┤ +│ Core DataJoint Types (Layer 2) │ +│ │ +│ float32 float64 int64 uint64 int32 uint32 int16 uint16 │ +│ int8 uint8 bool uuid json blob date datetime │ +│ char(n) varchar(n) enum(...) 
│ +├───────────────────────────────────────────────────────────────────┤ +│ Native Database Types (Layer 1) │ +│ │ +│ MySQL: TINYINT SMALLINT INT BIGINT FLOAT DOUBLE ... │ +│ PostgreSQL: SMALLINT INTEGER BIGINT REAL DOUBLE PRECISION │ +│ (pass through with warning for non-standard types) │ +└───────────────────────────────────────────────────────────────────┘ +``` + +**Syntax distinction:** +- Core types: `int32`, `float64`, `varchar(255)` - no brackets +- AttributeTypes: ``, ``, `` - angle brackets + +### OAS Storage Regions + +| Region | Path Pattern | Addressing | Use Case | +|--------|--------------|------------|----------| +| Object | `{schema}/{table}/{pk}/` | Primary key | Large objects, Zarr, HDF5 | +| Content | `_content/{hash}` | Content hash | Deduplicated blobs/files | + +### External References + +`` provides portable relative paths within configured stores with lazy ObjectRef access. +For arbitrary URLs that don't need ObjectRef semantics, use `varchar` instead. + +## Core DataJoint Types (Layer 2) + +Core types provide a standardized, scientist-friendly interface that works identically across +MySQL and PostgreSQL backends. Users should prefer these over native database types. 
+ +**All core types are recorded in field comments using `:type:` syntax for reconstruction.** + +### Numeric Types + +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `int8` | 8-bit signed | `TINYINT` | +| `int16` | 16-bit signed | `SMALLINT` | +| `int32` | 32-bit signed | `INT` | +| `int64` | 64-bit signed | `BIGINT` | +| `uint8` | 8-bit unsigned | `TINYINT UNSIGNED` | +| `uint16` | 16-bit unsigned | `SMALLINT UNSIGNED` | +| `uint32` | 32-bit unsigned | `INT UNSIGNED` | +| `uint64` | 64-bit unsigned | `BIGINT UNSIGNED` | +| `float32` | 32-bit float | `FLOAT` | +| `float64` | 64-bit float | `DOUBLE` | + +### String Types + +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `char(n)` | Fixed-length | `CHAR(n)` | +| `varchar(n)` | Variable-length | `VARCHAR(n)` | + +### Boolean + +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `bool` | True/False | `TINYINT` | + +### Date/Time Types + +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `date` | Date only | `DATE` | +| `datetime` | Date and time | `DATETIME` | + +### Binary Types + +The core `blob` type stores raw bytes without any serialization. Use `` AttributeType +for serialized Python objects. + +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `blob` | Raw bytes | `LONGBLOB` | + +### Other Types + +| Core Type | Description | MySQL | +|-----------|-------------|-------| +| `json` | JSON document | `JSON` | +| `uuid` | UUID | `BINARY(16)` | +| `enum(...)` | Enumeration | `ENUM(...)` | + +### Native Passthrough Types + +Users may use native database types directly (e.g., `text`, `mediumint auto_increment`), +but these will generate a warning about non-standard usage. Native types are not recorded +in field comments and may have portability issues across database backends. + +## AttributeTypes (Layer 3) + +AttributeTypes provide `encode()`/`decode()` semantics on top of core types. 
They are +composable and can be built-in or user-defined. + +### `` / `` - Path-Addressed Storage + +**Built-in AttributeType.** OAS (Object-Augmented Schema) storage: + +- Path derived from primary key: `{schema}/{table}/{pk}/{attribute}/` +- One-to-one relationship with table row +- Deleted when row is deleted +- Returns `ObjectRef` for lazy access +- Supports direct writes (Zarr, HDF5) via fsspec +- **dtype**: `json` (stores path, store name, metadata) + +```python +class Analysis(dj.Computed): + definition = """ + -> Recording + --- + results : # default store + archive : # specific store + """ +``` + +#### Implementation + +```python +class ObjectType(AttributeType): + """Built-in AttributeType for path-addressed OAS storage.""" + type_name = "object" + dtype = "json" + + def encode(self, value, *, key=None, store_name=None) -> dict: + store = get_store(store_name or dj.config['stores']['default']) + path = self._compute_path(key) # {schema}/{table}/{pk}/{attr}/ + store.put(path, value) + return { + "path": path, + "store": store_name, + # Additional metadata (size, timestamps, etc.) 
+ } + + def decode(self, stored: dict, *, key=None) -> ObjectRef: + return ObjectRef( + store=get_store(stored["store"]), + path=stored["path"] + ) +``` + +### `` / `` - Content-Addressed Storage + +**Built-in AttributeType.** Content-addressed storage with deduplication: + +- **Single blob only**: stores a single file or serialized object (not folders) +- **Per-project scope**: content is shared across all schemas in a project (not per-schema) +- Path derived from content hash: `_content/{hash[:2]}/{hash[2:4]}/{hash}` +- Many-to-one: multiple rows (even across schemas) can reference same content +- Reference counted for garbage collection +- Deduplication: identical content stored once across the entire project +- For folders/complex objects, use `object` type instead +- **dtype**: `json` (stores hash, store name, size, metadata) + +``` +store_root/ +├── {schema}/{table}/{pk}/ # object storage (path-addressed by PK) +│ └── {attribute}/ +│ +└── _content/ # content storage (content-addressed) + └── {hash[:2]}/{hash[2:4]}/{hash} +``` + +#### Implementation + +```python +class ContentType(AttributeType): + """Built-in AttributeType for content-addressed storage.""" + type_name = "content" + dtype = "json" + + def encode(self, data: bytes, *, key=None, store_name=None) -> dict: + """Store content, return metadata as JSON.""" + content_hash = hashlib.sha256(data).hexdigest() + store = get_store(store_name or dj.config['stores']['default']) + path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + + if not store.exists(path): + store.put(path, data) + ContentRegistry().insert1({ + 'content_hash': content_hash, + 'store': store_name, + 'size': len(data) + }, skip_duplicates=True) + + return { + "hash": content_hash, + "store": store_name, + "size": len(data) + } + + def decode(self, stored: dict, *, key=None) -> bytes: + """Retrieve content by hash.""" + store = get_store(stored["store"]) + path = 
f"_content/{stored['hash'][:2]}/{stored['hash'][2:4]}/{stored['hash']}" + return store.get(path) +``` + +#### Database Column + +The `` type stores JSON metadata: + +```sql +-- content column (MySQL) +features JSON NOT NULL +-- Contains: {"hash": "abc123...", "store": "main", "size": 12345} + +-- content column (PostgreSQL) +features JSONB NOT NULL +``` + +### `` - Portable External Reference + +**Built-in AttributeType.** Relative path references within configured stores: + +- **Relative paths**: paths within a configured store (portable across environments) +- **Store-aware**: resolves paths against configured store backend +- Returns `ObjectRef` for lazy access via fsspec +- Stores optional checksum for verification +- **dtype**: `json` (stores path, store name, checksum, metadata) + +**Key benefit**: Portability. The path is relative to the store, so pipelines can be moved +between environments (dev → prod, cloud → local) by changing store configuration without +updating data. + +```python +class RawData(dj.Manual): + definition = """ + session_id : int32 + --- + recording : # relative path within 'main' store + """ + +# Insert - user provides relative path within the store +table.insert1({ + 'session_id': 1, + 'recording': 'experiment_001/data.nwb' # relative to main store root +}) + +# Fetch - returns ObjectRef (lazy) +row = (table & 'session_id=1').fetch1() +ref = row['recording'] # ObjectRef +ref.download('/local/path') # explicit download +ref.open() # fsspec streaming access +``` + +#### When to Use `` vs `varchar` + +| Use Case | Recommended Type | +|----------|------------------| +| Need ObjectRef/lazy access | `` | +| Need portability (relative paths) | `` | +| Want checksum verification | `` | +| Just storing a URL string | `varchar` | +| External URLs you don't control | `varchar` | + +For arbitrary URLs (S3, HTTP, etc.) where you don't need ObjectRef semantics, +just use `varchar`. A string is simpler and more transparent. 
+ +#### Implementation + +```python +class FilepathType(AttributeType): + """Built-in AttributeType for store-relative file references.""" + type_name = "filepath" + dtype = "json" + + def encode(self, relative_path: str, *, key=None, store_name=None, + compute_checksum: bool = False) -> dict: + """Register reference to file in store.""" + store = get_store(store_name) # store_name required for filepath + metadata = {'path': relative_path, 'store': store_name} + + if compute_checksum: + full_path = store.resolve(relative_path) + if store.exists(full_path): + metadata['checksum'] = compute_file_checksum(store, full_path) + metadata['size'] = store.size(full_path) + + return metadata + + def decode(self, stored: dict, *, key=None) -> ObjectRef: + """Return ObjectRef for lazy access.""" + return ObjectRef( + store=get_store(stored['store']), + path=stored['path'], + checksum=stored.get('checksum') # optional verification + ) +``` + +#### Database Column + +```sql +-- filepath column (MySQL) +recording JSON NOT NULL +-- Contains: {"path": "experiment_001/data.nwb", "store": "main", "checksum": "...", "size": ...} + +-- filepath column (PostgreSQL) +recording JSONB NOT NULL +``` + +#### Key Differences from Legacy `filepath@store` (now ``) + +| Feature | Legacy | New | +|---------|--------|-----| +| Access | Copy to local stage | ObjectRef (lazy) | +| Copying | Automatic | Explicit via `ref.download()` | +| Streaming | No | Yes via `ref.open()` | +| Paths | Relative | Relative (unchanged) | +| Store param | Required (`@store`) | Required (`@store`) | + +## Database Types + +### `json` - Cross-Database JSON Type + +JSON storage compatible across MySQL and PostgreSQL: + +```sql +-- MySQL +column_name JSON NOT NULL + +-- PostgreSQL (uses JSONB for better indexing) +column_name JSONB NOT NULL +``` + +The `json` database type: +- Used as dtype by built-in AttributeTypes (``, ``, ``) +- Stores arbitrary JSON-serializable data +- Automatically uses appropriate type for 
database backend +- Supports JSON path queries where available + +## Parameterized AttributeTypes + +AttributeTypes can be parameterized with `` syntax. The parameter specifies +which store to use: + +```python +class AttributeType: + type_name: str # Name used in or as bare type + dtype: str # Database type or built-in AttributeType + + # When user writes type_name@param, resolved store becomes param +``` + +**Resolution examples:** +``` + → uses type → default store + → uses type → cold store + → dtype = "longblob" → database (no store) + → uses type → cold store +``` + +AttributeTypes can use other AttributeTypes as their dtype (composition): +- `` uses `` - adds djblob serialization on top of content-addressed storage +- `` uses `` - adds filename preservation on top of content-addressed storage + +## User-Defined AttributeTypes + +### `` - Internal Serialized Blob + +Serialized Python object stored in database. + +```python +@dj.register_type +class DJBlobType(AttributeType): + type_name = "djblob" + dtype = "longblob" # MySQL type + + def encode(self, value, *, key=None) -> bytes: + from . import blob + return blob.pack(value, compress=True) + + def decode(self, stored, *, key=None) -> Any: + from . import blob + return blob.unpack(stored) +``` + +### `` / `` - External Serialized Blob + +Serialized Python object stored in content-addressed storage. + +```python +@dj.register_type +class XBlobType(AttributeType): + type_name = "xblob" + dtype = "content" # Core type - uses default store + # dtype = "content@store" for specific store + + def encode(self, value, *, key=None) -> bytes: + from . import blob + return blob.pack(value, compress=True) + + def decode(self, stored, *, key=None) -> Any: + from . 
import blob + return blob.unpack(stored) +``` + +Usage: +```python +class ProcessedData(dj.Computed): + definition = """ + -> RawData + --- + small_result : # internal (in database) + large_result : # external (default store) + archive_result : # external (specific store) + """ +``` + +### `` - Internal File Attachment + +File stored in database with filename preserved. + +```python +@dj.register_type +class AttachType(AttributeType): + type_name = "attach" + dtype = "longblob" + + def encode(self, filepath, *, key=None) -> bytes: + path = Path(filepath) + return path.name.encode() + b"\0" + path.read_bytes() + + def decode(self, stored, *, key=None) -> str: + filename, contents = stored.split(b"\0", 1) + filename = filename.decode() + download_path = Path(dj.config['download_path']) / filename + download_path.parent.mkdir(parents=True, exist_ok=True) + download_path.write_bytes(contents) + return str(download_path) +``` + +### `` / `` - External File Attachment + +File stored in content-addressed storage with filename preserved. 
+ +```python +@dj.register_type +class XAttachType(AttributeType): + type_name = "xattach" + dtype = "content" # Core type + + def encode(self, filepath, *, key=None) -> bytes: + path = Path(filepath) + # Include filename in stored data + return path.name.encode() + b"\0" + path.read_bytes() + + def decode(self, stored, *, key=None) -> str: + filename, contents = stored.split(b"\0", 1) + filename = filename.decode() + download_path = Path(dj.config['download_path']) / filename + download_path.parent.mkdir(parents=True, exist_ok=True) + download_path.write_bytes(contents) + return str(download_path) +``` + +Usage: +```python +class Attachments(dj.Manual): + definition = """ + attachment_id : int + --- + config : <attach> # internal (small file in DB) + data_file : <xattach> # external (default store) + archive : <xattach@cold> # external (specific store) + """ +``` + +## Storage Comparison + +| Type | dtype | Storage Location | Dedup | Returns | +|------|-------|------------------|-------|---------| +| `<object>` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `<object@store>` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `<content>` | `json` | `_content/{hash}` | Yes | bytes | +| `<content@store>` | `json` | `_content/{hash}` | Yes | bytes | +| `<filepath@store>` | `json` | Configured store (relative path) | No | ObjectRef | +| `<djblob>` | `longblob` | Database | No | Python object | +| `<xblob>` | `<content>` | `_content/{hash}` | Yes | Python object | +| `<xblob@store>` | `<content@store>` | `_content/{hash}` | Yes | Python object | +| `<attach>` | `longblob` | Database | No | Local file path | +| `<xattach>` | `<content>` | `_content/{hash}` | Yes | Local file path | +| `<xattach@store>` | `<content@store>` | `_content/{hash}` | Yes | Local file path | + +## Reference Counting for Content Type + +The `ContentRegistry` is a **project-level** table that tracks content-addressed objects +across all schemas. This differs from the legacy `~external_*` tables which were per-schema. + +```python +class ContentRegistry: + """ + Project-level content registry. + Stored in a designated database (e.g., `{project}_content`). 
+ """ + definition = """ + # Content-addressed object registry (project-wide) + content_hash : char(64) # SHA256 hex + --- + store : varchar(64) # Store name + size : bigint unsigned # Size in bytes + created : timestamp DEFAULT CURRENT_TIMESTAMP + """ +``` + +Garbage collection scans **all schemas** in the project: + +```python +def garbage_collect(project): + """Remove content not referenced by any table in any schema.""" + # Get all registered hashes + registered = set(ContentRegistry().fetch('content_hash', 'store')) + + # Get all referenced hashes from ALL schemas in the project + referenced = set() + for schema in project.schemas: + for table in schema.tables: + for attr in table.heading.attributes: + if attr.type in ('content', 'content@...'): + hashes = table.fetch(attr.name) + referenced.update((h, attr.store) for h in hashes) + + # Delete orphaned content + for content_hash, store in (registered - referenced): + store_backend = get_store(store) + store_backend.delete(content_path(content_hash)) + (ContentRegistry() & {'content_hash': content_hash}).delete() +``` + +## Built-in AttributeType Comparison + +| Feature | `` | `` | `` | +|---------|------------|-------------|---------------------| +| dtype | `json` | `json` | `json` | +| Location | OAS store | OAS store | Configured store | +| Addressing | Primary key | Content hash | Relative path | +| Path control | DataJoint | DataJoint | User | +| Deduplication | No | Yes | No | +| Structure | Files, folders, Zarr | Single blob only | Any (via fsspec) | +| Access | ObjectRef (lazy) | Transparent (bytes) | ObjectRef (lazy) | +| GC | Deleted with row | Reference counted | N/A (user managed) | +| Integrity | DataJoint managed | DataJoint managed | User managed | + +**When to use each:** +- **``**: Large/complex objects where DataJoint controls organization (Zarr, HDF5) +- **``**: Deduplicated serialized data or file attachments via ``, `` +- **``**: Portable references to files in configured stores +- 
**`varchar`**: Arbitrary URLs/paths where ObjectRef semantics aren't needed + +## Key Design Decisions + +1. **Three-layer architecture**: + - Layer 1: Native database types (backend-specific, discouraged) + - Layer 2: Core DataJoint types (standardized, scientist-friendly) + - Layer 3: AttributeTypes (encode/decode, composable) +2. **Core types are scientist-friendly**: `float32`, `uint8`, `bool` instead of `FLOAT`, `TINYINT UNSIGNED`, `TINYINT(1)` +3. **AttributeTypes use angle brackets**: ``, ``, `` - distinguishes from core types +4. **AttributeTypes are composable**: `` uses ``, which uses `json` +5. **Built-in AttributeTypes use JSON dtype**: Stores metadata (path, hash, store name, etc.) +6. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores +7. **Filepath for portability**: `` uses relative paths within stores for environment portability +8. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent +9. **Content type**: Single-blob, content-addressed, deduplicated storage +10. **Parameterized types**: `` passes store parameter +11. **Naming convention**: + - `` = internal serialized (database) + - `` = external serialized (content-addressed) + - `` = internal file (single file) + - `` = external file (single file) +12. **Transparent access**: AttributeTypes return Python objects or file paths +13. **Lazy access**: ``, ``, and `` return ObjectRef + +## Migration from Legacy Types + +| Legacy | New Equivalent | +|--------|----------------| +| `longblob` (auto-serialized) | `` | +| `blob@store` | `` | +| `attach` | `` | +| `attach@store` | `` | +| `filepath@store` (copy-based) | `filepath@store` (ObjectRef-based, upgraded) | + +### Migration from Legacy `~external_*` Stores + +Legacy external storage used per-schema `~external_{store}` tables. 
Migration to the new +per-project `ContentRegistry` requires: + +```python +def migrate_external_store(schema, store_name): + """ + Migrate legacy ~external_{store} to new ContentRegistry. + + 1. Read all entries from ~external_{store} + 2. For each entry: + - Fetch content from legacy location + - Compute SHA256 hash + - Copy to _content/{hash}/ if not exists + - Update table column from UUID to hash + - Register in ContentRegistry + 3. After all schemas migrated, drop ~external_{store} tables + """ + external_table = schema.external[store_name] + + for entry in external_table.fetch(as_dict=True): + legacy_uuid = entry['hash'] + + # Fetch content from legacy location + content = external_table.get(legacy_uuid) + + # Compute new content hash + content_hash = hashlib.sha256(content).hexdigest() + + # Store in new location if not exists + new_path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + store = get_store(store_name) + if not store.exists(new_path): + store.put(new_path, content) + + # Register in project-wide ContentRegistry + ContentRegistry().insert1({ + 'content_hash': content_hash, + 'store': store_name, + 'size': len(content) + }, skip_duplicates=True) + + # Update referencing tables (UUID -> hash) + # ... update all tables that reference this UUID ... + + # After migration complete for all schemas: + # DROP TABLE `{schema}`.`~external_{store}` +``` + +**Migration considerations:** +- Legacy UUIDs were based on content hash but stored as `binary(16)` +- New system uses `char(64)` SHA256 hex strings +- Migration can be done incrementally per schema +- Backward compatibility layer can read both formats during transition + +## Open Questions + +1. Should `content` without `@store` use a default store, or require explicit store? +2. Should we support `` without `@store` syntax (implying default store)? +3. How long should the backward compatibility layer support legacy `~external_*` format? 
diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index 2fba6bd84..a19aae6d0 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -45,8 +45,11 @@ "kill", "MatCell", "MatStruct", - "AttributeAdapter", + "AttributeType", + "register_type", + "list_types", "errors", + "migrate", "DataJointError", "key", "key_hash", @@ -56,8 +59,9 @@ ] from . import errors +from . import migrate from .admin import kill -from .attribute_adapter import AttributeAdapter +from .attribute_type import AttributeType, list_types, register_type from .blob import MatCell, MatStruct from .cli import cli from .connection import Connection, conn diff --git a/src/datajoint/attribute_adapter.py b/src/datajoint/attribute_adapter.py deleted file mode 100644 index 12a34f27e..000000000 --- a/src/datajoint/attribute_adapter.py +++ /dev/null @@ -1,61 +0,0 @@ -import re - -from .errors import DataJointError, _support_adapted_types - - -class AttributeAdapter: - """ - Base class for adapter objects for user-defined attribute types. - """ - - @property - def attribute_type(self): - """ - :return: a supported DataJoint attribute type to use; e.g. "longblob", "blob@store" - """ - raise NotImplementedError("Undefined attribute adapter") - - def get(self, value): - """ - convert value retrieved from the the attribute in a table into the adapted type - - :param value: value from the database - - :return: object of the adapted type - """ - raise NotImplementedError("Undefined attribute adapter") - - def put(self, obj): - """ - convert an object of the adapted type into a value that DataJoint can store in a table attribute - - :param obj: an object of the adapted type - :return: value to store in the database - """ - raise NotImplementedError("Undefined attribute adapter") - - -def get_adapter(context, adapter_name): - """ - Extract the AttributeAdapter object by its name from the context and validate. 
- """ - if not _support_adapted_types(): - raise DataJointError("Support for Adapted Attribute types is disabled.") - adapter_name = adapter_name.lstrip("<").rstrip(">") - try: - adapter = context[adapter_name] - except KeyError: - raise DataJointError("Attribute adapter '{adapter_name}' is not defined.".format(adapter_name=adapter_name)) - if not isinstance(adapter, AttributeAdapter): - raise DataJointError( - "Attribute adapter '{adapter_name}' must be an instance of datajoint.AttributeAdapter".format( - adapter_name=adapter_name - ) - ) - if not isinstance(adapter.attribute_type, str) or not re.match(r"^\w", adapter.attribute_type): - raise DataJointError( - "Invalid attribute type {type} in attribute adapter '{adapter_name}'".format( - type=adapter.attribute_type, adapter_name=adapter_name - ) - ) - return adapter diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py new file mode 100644 index 000000000..37fae88ca --- /dev/null +++ b/src/datajoint/attribute_type.py @@ -0,0 +1,497 @@ +""" +Custom attribute type system for DataJoint. + +This module provides the AttributeType base class and registration mechanism +for creating custom data types that extend DataJoint's native type system. + +Custom types enable seamless integration of complex Python objects (like NumPy arrays, +graphs, or domain-specific structures) with DataJoint's relational storage. 
+ +Example: + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph: nx.Graph) -> list: + return list(graph.edges) + + def decode(self, edges: list) -> nx.Graph: + return nx.Graph(edges) + + # Then use in table definitions: + class MyTable(dj.Manual): + definition = ''' + id : int + --- + data : + ''' +""" + +from __future__ import annotations + +import logging +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +from .errors import DataJointError + +if TYPE_CHECKING: + pass + +logger = logging.getLogger(__name__.split(".")[0]) + +# Global type registry - maps type_name to AttributeType instance +_type_registry: dict[str, AttributeType] = {} +_entry_points_loaded: bool = False + + +class AttributeType(ABC): + """ + Base class for custom DataJoint attribute types. + + Subclass this to create custom types that can be used in table definitions + with the ```` syntax. Custom types define bidirectional conversion + between Python objects and DataJoint's storage format. + + Attributes: + type_name: Unique identifier used in ```` syntax + dtype: Underlying DataJoint storage type + + Example: + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + dtype = "longblob" + + def encode(self, graph): + return list(graph.edges) + + def decode(self, edges): + import networkx as nx + return nx.Graph(edges) + + The type can then be used in table definitions:: + + class Connectivity(dj.Manual): + definition = ''' + id : int + --- + graph_data : + ''' + """ + + @property + @abstractmethod + def type_name(self) -> str: + """ + Unique identifier for this type, used in table definitions as ````. + + This name must be unique across all registered types. It should be lowercase + with underscores (e.g., "graph", "zarr_array", "compressed_image"). + + Returns: + The type name string without angle brackets. + """ + ... 
+ + @property + @abstractmethod + def dtype(self) -> str: + """ + The underlying DataJoint type used for storage. + + Can be: + - A native type: ``"longblob"``, ``"blob"``, ``"varchar(255)"``, ``"int"``, ``"json"`` + - An external type: ``"blob@store"``, ``"attach@store"`` + - The object type: ``"object"`` + - Another custom type: ``""`` (enables type chaining) + + Returns: + The storage type specification string. + """ + ... + + @abstractmethod + def encode(self, value: Any, *, key: dict | None = None) -> Any: + """ + Convert a Python object to the storable format. + + Called during INSERT operations to transform user-provided objects + into a format suitable for storage in the underlying ``dtype``. + + Args: + value: The Python object to store. + key: Primary key values as a dict. Available when the dtype uses + object storage and may be needed for path construction. + + Returns: + Value in the format expected by ``dtype``. For example: + - For ``dtype="longblob"``: any picklable Python object + - For ``dtype="object"``: path string or file-like object + - For ``dtype="varchar(N)"``: string + """ + ... + + @abstractmethod + def decode(self, stored: Any, *, key: dict | None = None) -> Any: + """ + Convert stored data back to a Python object. + + Called during FETCH operations to reconstruct the original Python + object from the stored format. + + Args: + stored: Data retrieved from storage. Type depends on ``dtype``: + - For ``"object"``: an ``ObjectRef`` handle + - For blob types: the unpacked Python object + - For native types: the native Python value (str, int, etc.) + key: Primary key values as a dict. + + Returns: + The reconstructed Python object. + """ + ... + + def validate(self, value: Any) -> None: + """ + Validate a value before encoding. + + Override this method to add type checking or domain constraints. + Called automatically before ``encode()`` during INSERT operations. + The default implementation accepts any value. 
+ + Args: + value: The value to validate. + + Raises: + TypeError: If the value has an incompatible type. + ValueError: If the value fails domain validation. + """ + pass + + def default(self) -> Any: + """ + Return a default value for this type. + + Override if the type has a sensible default value. The default + implementation raises NotImplementedError, indicating no default exists. + + Returns: + The default value for this type. + + Raises: + NotImplementedError: If no default exists (the default behavior). + """ + raise NotImplementedError(f"No default value for type <{self.type_name}>") + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}(type_name={self.type_name!r}, dtype={self.dtype!r})>" + + +def register_type(cls: type[AttributeType]) -> type[AttributeType]: + """ + Register a custom attribute type with DataJoint. + + Can be used as a decorator or called directly. The type becomes available + for use in table definitions with the ```` syntax. + + Args: + cls: An AttributeType subclass to register. + + Returns: + The same class, unmodified (allows use as decorator). + + Raises: + DataJointError: If a type with the same name is already registered + by a different class. + TypeError: If cls is not an AttributeType subclass. + + Example: + As a decorator:: + + @dj.register_type + class GraphType(dj.AttributeType): + type_name = "graph" + ... 
def parse_type_spec(spec: str) -> tuple[str, str | None]:
    """
    Split a type specification into its type name and optional store.

    Accepted forms (angle brackets optional)::

        "<xblob>"       -> ("xblob", None)
        "<xblob@cold>"  -> ("xblob", "cold")
        "xblob@cold"    -> ("xblob", "cold")
        "xblob"         -> ("xblob", None)

    Args:
        spec: Type specification string, with or without angle brackets.

    Returns:
        Tuple of ``(type_name, store_name)``; ``store_name`` is ``None``
        when no ``@store`` suffix is present.
    """
    # Strip angle brackets, then surrounding whitespace
    spec = spec.strip("<>").strip()

    if "@" in spec:
        type_name, store_name = spec.split("@", 1)
        return type_name.strip(), store_name.strip()

    return spec, None


def unregister_type(name: str) -> None:
    """
    Remove a type from the registry.

    Primarily useful for testing. Use with caution in production code.

    Args:
        name: The type_name to unregister. Angle brackets and any
            ``@store`` suffix are ignored, matching :func:`get_type`.

    Raises:
        DataJointError: If the type is not registered.
    """
    # Fix: parse the same way get_type()/is_type_registered() do, so that
    # "<xblob@cold>" unregisters "xblob" instead of failing the lookup on
    # the literal string "xblob@cold".
    type_name, _ = parse_type_spec(name)
    if type_name not in _type_registry:
        raise DataJointError(f"Type <{type_name}> is not registered")
    del _type_registry[type_name]
def list_types() -> list[str]:
    """
    List all registered type names.

    Returns:
        Sorted list of registered type names (including entry-point types).
    """
    _load_entry_points()
    return sorted(_type_registry)


def is_type_registered(name: str) -> bool:
    """
    Check whether a type name is registered.

    Args:
        name: The type name to check; angle brackets and ``@store``
            suffixes are ignored.

    Returns:
        True if the type is registered.
    """
    type_name, _ = parse_type_spec(name)
    if type_name not in _type_registry:
        # Not found in the explicit registry -- give lazily-discovered
        # entry-point types a chance before answering.
        _load_entry_points()
    return type_name in _type_registry
+ """ + global _entry_points_loaded + if _entry_points_loaded: + return + + _entry_points_loaded = True + + try: + from importlib.metadata import entry_points + except ImportError: + # Python < 3.10 fallback + try: + from importlib_metadata import entry_points + except ImportError: + logger.debug("importlib.metadata not available, skipping entry point discovery") + return + + try: + # Python 3.10+ / importlib_metadata 3.6+ + eps = entry_points(group="datajoint.types") + except TypeError: + # Older API + eps = entry_points().get("datajoint.types", []) + + for ep in eps: + if ep.name in _type_registry: + # Already registered explicitly, skip entry point + continue + try: + type_class = ep.load() + register_type(type_class) + logger.debug(f"Loaded attribute type <{ep.name}> from entry point {ep.value}") + except Exception as e: + logger.warning(f"Failed to load attribute type '{ep.name}' from {ep.value}: {e}") + + +def resolve_dtype( + dtype: str, seen: set[str] | None = None, store_name: str | None = None +) -> tuple[str, list[AttributeType], str | None]: + """ + Resolve a dtype string, following type chains. + + If dtype references another custom type (e.g., ""), recursively + resolves to find the ultimate storage type. Store parameters are propagated + through the chain. + + Args: + dtype: The dtype string to resolve (e.g., "", "", "longblob"). + seen: Set of already-seen type names (for cycle detection). + store_name: Store name from outer type specification (propagated inward). + + Returns: + Tuple of (final_storage_type, list_of_types_in_chain, resolved_store_name). + The chain is ordered from outermost to innermost type. + + Raises: + DataJointError: If a circular type reference is detected. 
+ + Examples: + >>> resolve_dtype("") + ("json", [XBlobType, ContentType], None) + + >>> resolve_dtype("") + ("json", [XBlobType, ContentType], "cold") + + >>> resolve_dtype("longblob") + ("longblob", [], None) + """ + if seen is None: + seen = set() + + chain: list[AttributeType] = [] + + # Check if dtype is a custom type reference + if dtype.startswith("<") and dtype.endswith(">"): + type_name, dtype_store = parse_type_spec(dtype) + + # Store from this level overrides inherited store + effective_store = dtype_store if dtype_store is not None else store_name + + if type_name in seen: + raise DataJointError(f"Circular type reference detected: <{type_name}>") + + seen.add(type_name) + attr_type = get_type(type_name) + chain.append(attr_type) + + # Recursively resolve the inner dtype, propagating store + inner_dtype, inner_chain, resolved_store = resolve_dtype(attr_type.dtype, seen, effective_store) + chain.extend(inner_chain) + return inner_dtype, chain, resolved_store + + # Not a custom type - check if it has a store suffix (e.g., "blob@store") + if "@" in dtype: + base_type, dtype_store = dtype.split("@", 1) + effective_store = dtype_store if dtype_store else store_name + return base_type, chain, effective_store + + # Plain type - return as-is with propagated store + return dtype, chain, store_name + + +def get_adapter(context: dict | None, adapter_name: str) -> tuple[AttributeType, str | None]: + """ + Get an attribute type by name. + + This is a compatibility function used by heading and declare modules. + + Args: + context: Ignored (legacy parameter, kept for API compatibility). + adapter_name: The type name, with or without angle brackets. + May include store parameter (e.g., ""). + + Returns: + Tuple of (AttributeType instance, store_name or None). + + Raises: + DataJointError: If the type is not found. 
+ """ + type_name, store_name = parse_type_spec(adapter_name) + + if is_type_registered(type_name): + return get_type(type_name), store_name + + raise DataJointError(f"Attribute type <{type_name}> is not registered. " "Use @dj.register_type to register custom types.") + + +# ============================================================================= +# Auto-register built-in types +# ============================================================================= + +# Import builtin_types module to register built-in types (DJBlobType, ContentType, etc.) +# This import has a side effect: it registers the types via @register_type decorators +from . import builtin_types as _builtin_types # noqa: F401, E402 diff --git a/src/datajoint/blob.py b/src/datajoint/blob.py index 424d88779..15364bfa4 100644 --- a/src/datajoint/blob.py +++ b/src/datajoint/blob.py @@ -56,8 +56,6 @@ compression = {b"ZL123\0": zlib.decompress} -bypass_serialization = False # runtime setting to bypass blob (en|de)code - # runtime setting to read integers as 32-bit to read blobs created by the 32-bit # version of the mYm library for MATLAB use_32bit_dims = False @@ -507,17 +505,9 @@ def pack(self, obj, compress): def pack(obj, compress=True): - if bypass_serialization: - # provide a way to move blobs quickly without de/serialization - assert isinstance(obj, bytes) and obj.startswith((b"ZL123\0", b"mYm\0", b"dj0\0")) - return obj return Blob().pack(obj, compress=compress) def unpack(blob, squeeze=False): - if bypass_serialization: - # provide a way to move blobs quickly without de/serialization - assert isinstance(blob, bytes) and blob.startswith((b"ZL123\0", b"mYm\0", b"dj0\0")) - return blob if blob is not None: return Blob(squeeze=squeeze).unpack(blob) diff --git a/src/datajoint/builtin_types.py b/src/datajoint/builtin_types.py new file mode 100644 index 000000000..bb2bb20a6 --- /dev/null +++ b/src/datajoint/builtin_types.py @@ -0,0 +1,746 @@ +""" +Built-in DataJoint attribute types. 
+ +This module defines the standard AttributeTypes that ship with DataJoint. +These serve as both useful built-in types and as examples for users who +want to create their own custom types. + +Built-in Types: + - ````: Serialize Python objects to DataJoint's blob format (internal storage) + - ````: Content-addressed storage with SHA256 deduplication + - ````: External serialized blobs using content-addressed storage + - ````: Path-addressed storage for files/folders (Zarr, HDF5) + - ````: Internal file attachment stored in database + - ````: External file attachment with deduplication + - ````: Reference to existing file in store + +Example - Creating a Custom Type: + Here's how to define your own AttributeType, modeled after the built-in types:: + + import datajoint as dj + import networkx as nx + + @dj.register_type + class GraphType(dj.AttributeType): + '''Store NetworkX graphs as edge lists.''' + + type_name = "graph" # Use as in definitions + dtype = "" # Compose with djblob for serialization + + def encode(self, graph, *, key=None, store_name=None): + # Convert graph to a serializable format + return { + 'nodes': list(graph.nodes(data=True)), + 'edges': list(graph.edges(data=True)), + } + + def decode(self, stored, *, key=None): + # Reconstruct graph from stored format + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G + + def validate(self, value): + if not isinstance(value, nx.Graph): + raise TypeError(f"Expected nx.Graph, got {type(value).__name__}") + + # Now use in table definitions: + @schema + class Networks(dj.Manual): + definition = ''' + network_id : int + --- + topology : + ''' +""" + +from __future__ import annotations + +from typing import Any + +from .attribute_type import AttributeType, register_type + + +# ============================================================================= +# DJBlob Types - DataJoint's native serialization +# 
@register_type
class DJBlobType(AttributeType):
    """
    DataJoint's native serialization type, used as ``<djblob>``.

    Serializes arbitrary Python objects — NumPy arrays, dicts, lists,
    datetime objects, UUIDs, and nested structures — into DataJoint's blob
    format, stored in a MySQL ``LONGBLOB`` column.

    Format features:
        - Protocol headers (``mYm`` for MATLAB-compatible, ``dj0`` for
          Python-native)
        - Optional zlib compression for data > 1KB
        - Support for nested structures

    Example::

        @schema
        class ProcessedData(dj.Manual):
            definition = '''
            data_id : int
            ---
            results : <djblob>   # serialized Python objects
            '''

        table.insert1({'data_id': 1, 'results': {'scores': [0.9, 0.8], 'labels': ['a', 'b']}})

    Note:
        A plain ``longblob`` column stores raw bytes without serialization;
        use ``<djblob>`` when automatic serialization is wanted.
    """

    type_name = "djblob"
    dtype = "longblob"

    def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes:
        """Pack *value* into DataJoint's (compressed) blob wire format."""
        from . import blob

        return blob.pack(value, compress=True)

    def decode(self, stored: bytes, *, key: dict | None = None) -> Any:
        """Unpack bytes produced by :meth:`encode` back into a Python object."""
        from . import blob

        return blob.unpack(stored, squeeze=False)
+ Data is identified by its SHA256 hash and stored in a hierarchical directory: + ``_content/{hash[:2]}/{hash[2:4]}/{hash}`` + + The database column stores JSON metadata: ``{hash, store, size}``. + Duplicate content is automatically deduplicated. + + Example:: + + @schema + class RawContent(dj.Manual): + definition = ''' + content_id : int + --- + data : + ''' + + # Insert raw bytes + table.insert1({'content_id': 1, 'data': b'raw binary content'}) + + Note: + This type accepts only ``bytes``. For Python objects, use ````. + A store must be specified (e.g., ````) unless a default + store is configured. + """ + + type_name = "content" + dtype = "json" + + def encode(self, value: bytes, *, key: dict | None = None, store_name: str | None = None) -> dict: + """ + Store content and return metadata. + + Args: + value: Raw bytes to store. + key: Primary key values (unused). + store_name: Store to use. If None, uses default store. + + Returns: + Metadata dict: {hash, store, size} + """ + from .content_registry import put_content + + return put_content(value, store_name=store_name) + + def decode(self, stored: dict, *, key: dict | None = None) -> bytes: + """ + Retrieve content by hash. + + Args: + stored: Metadata dict with 'hash' and optionally 'store'. + key: Primary key values (unused). + + Returns: + Original bytes. + """ + from .content_registry import get_content + + return get_content(stored["hash"], store_name=stored.get("store")) + + def validate(self, value: Any) -> None: + """Validate that value is bytes.""" + if not isinstance(value, bytes): + raise TypeError(f" expects bytes, got {type(value).__name__}") + + +@register_type +class XBlobType(AttributeType): + """ + External serialized blobs with content-addressed storage. + + The ```` type combines DataJoint's blob serialization with + content-addressed storage. Objects are serialized, then stored externally + with automatic deduplication. 
+ + This is ideal for large objects (NumPy arrays, DataFrames) that may be + duplicated across rows. + + Example:: + + @schema + class LargeArrays(dj.Manual): + definition = ''' + array_id : int + --- + data : + ''' + + import numpy as np + table.insert1({'array_id': 1, 'data': np.random.rand(1000, 1000)}) + + Type Composition: + ```` composes with ````:: + + Insert: object → blob.pack() → put_content() → JSON metadata + Fetch: JSON → get_content() → blob.unpack() → object + + Note: + - For internal storage, use ```` + - For raw bytes without serialization, use ```` + """ + + type_name = "xblob" + dtype = "" # Composition: uses ContentType + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: + """Serialize object to bytes (passed to ContentType).""" + from . import blob + + return blob.pack(value, compress=True) + + def decode(self, stored: bytes, *, key: dict | None = None) -> Any: + """Deserialize bytes back to Python object.""" + from . import blob + + return blob.unpack(stored, squeeze=False) + + +# ============================================================================= +# Path-Addressed Storage Types (OAS - Object-Augmented Schema) +# ============================================================================= + + +@register_type +class ObjectType(AttributeType): + """ + Path-addressed storage for files and folders. + + The ```` type provides managed file/folder storage where the path + is derived from the primary key: ``{schema}/{table}/objects/{pk}/{field}_{token}.{ext}`` + + Unlike ```` (content-addressed), each row has its own storage path, + and content is deleted when the row is deleted. 
This is ideal for: + + - Zarr arrays (hierarchical chunked data) + - HDF5 files + - Complex multi-file outputs + - Any content that shouldn't be deduplicated + + Example:: + + @schema + class Analysis(dj.Computed): + definition = ''' + -> Recording + --- + results : + ''' + + def make(self, key): + # Store a file + self.insert1({**key, 'results': '/path/to/results.zarr'}) + + # Fetch returns ObjectRef for lazy access + ref = (Analysis & key).fetch1('results') + ref.path # Storage path + ref.read() # Read file content + ref.fsmap # For zarr.open(ref.fsmap) + + Storage Structure: + Objects are stored at:: + + {store_root}/{schema}/{table}/objects/{pk}/{field}_{token}.ext + + The token ensures uniqueness even if content is replaced. + + Comparison with ````:: + + | Aspect | | | + |----------------|-------------------|---------------------| + | Addressing | Path (by PK) | Hash (by content) | + | Deduplication | No | Yes | + | Deletion | With row | GC when unreferenced| + | Use case | Zarr, HDF5 | Blobs, attachments | + + Note: + A store must be specified (````) unless a default store + is configured. Returns ``ObjectRef`` on fetch for lazy access. + """ + + type_name = "object" + dtype = "json" + + def encode( + self, + value: Any, + *, + key: dict | None = None, + store_name: str | None = None, + ) -> dict: + """ + Store content and return metadata. + + Args: + value: Content to store. Can be: + - bytes: Raw bytes to store as file + - str/Path: Path to local file or folder to upload + key: Dict containing context for path construction: + - _schema: Schema name + - _table: Table name + - _field: Field/attribute name + - Other entries are primary key values + store_name: Store to use. If None, uses default store. 
+ + Returns: + Metadata dict suitable for ObjectRef.from_json() + """ + from datetime import datetime, timezone + from pathlib import Path + + from .content_registry import get_store_backend + from .storage import build_object_path + + # Extract context from key + key = key or {} + schema = key.pop("_schema", "unknown") + table = key.pop("_table", "unknown") + field = key.pop("_field", "data") + primary_key = {k: v for k, v in key.items() if not k.startswith("_")} + + # Determine content type and extension + is_dir = False + ext = None + size = None + + if isinstance(value, bytes): + content = value + size = len(content) + elif isinstance(value, (str, Path)): + source_path = Path(value) + if not source_path.exists(): + raise FileNotFoundError(f"Source path does not exist: {source_path}") + is_dir = source_path.is_dir() + ext = source_path.suffix if not is_dir else None + if is_dir: + # For directories, we'll upload later + content = None + else: + content = source_path.read_bytes() + size = len(content) + else: + raise TypeError(f" expects bytes or path, got {type(value).__name__}") + + # Build storage path + path, token = build_object_path( + schema=schema, + table=table, + field=field, + primary_key=primary_key, + ext=ext, + ) + + # Get storage backend + backend = get_store_backend(store_name) + + # Upload content + if is_dir: + # Upload directory recursively + source_path = Path(value) + backend.put_folder(str(source_path), path) + # Compute size by summing all files + size = sum(f.stat().st_size for f in source_path.rglob("*") if f.is_file()) + else: + backend.put_buffer(content, path) + + # Build metadata + timestamp = datetime.now(timezone.utc) + metadata = { + "path": path, + "store": store_name, + "size": size, + "ext": ext, + "is_dir": is_dir, + "timestamp": timestamp.isoformat(), + } + + return metadata + + def decode(self, stored: dict, *, key: dict | None = None) -> Any: + """ + Create ObjectRef handle for lazy access. 
+ + Args: + stored: Metadata dict from database. + key: Primary key values (unused). + + Returns: + ObjectRef for accessing the stored content. + """ + from .content_registry import get_store_backend + from .objectref import ObjectRef + + store_name = stored.get("store") + backend = get_store_backend(store_name) + return ObjectRef.from_json(stored, backend=backend) + + def validate(self, value: Any) -> None: + """Validate that value is bytes or a valid path.""" + from pathlib import Path + + if isinstance(value, bytes): + return + if isinstance(value, (str, Path)): + return + raise TypeError(f" expects bytes or path, got {type(value).__name__}") + + +# ============================================================================= +# File Attachment Types +# ============================================================================= + + +@register_type +class AttachType(AttributeType): + """ + Internal file attachment stored in database. + + The ```` type stores a file directly in the database as a ``LONGBLOB``. + The filename is preserved and the file is extracted to the configured + download path on fetch. + + Example:: + + @schema + class Documents(dj.Manual): + definition = ''' + doc_id : int + --- + report : + ''' + + # Insert a file + table.insert1({'doc_id': 1, 'report': '/path/to/report.pdf'}) + + # Fetch extracts to download_path and returns local path + local_path = (table & 'doc_id=1').fetch1('report') + + Storage Format: + The blob contains: ``filename\\0contents`` + - Filename (UTF-8 encoded) + null byte + raw file contents + + Note: + - For large files, use ```` (external storage with deduplication) + - For files that shouldn't be copied, use ```` + """ + + type_name = "attach" + dtype = "longblob" + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: + """ + Read file and encode as filename + contents. + + Args: + value: Path to file (str or Path). + key: Primary key values (unused). 
+ store_name: Unused for internal storage. + + Returns: + Bytes: filename (UTF-8) + null byte + file contents + """ + from pathlib import Path + + path = Path(value) + if not path.exists(): + raise FileNotFoundError(f"Attachment file not found: {path}") + if path.is_dir(): + raise IsADirectoryError(f" does not support directories: {path}") + + filename = path.name + contents = path.read_bytes() + return filename.encode("utf-8") + b"\x00" + contents + + def decode(self, stored: bytes, *, key: dict | None = None) -> str: + """ + Extract file to download path and return local path. + + Args: + stored: Blob containing filename + null + contents. + key: Primary key values (unused). + + Returns: + Path to extracted file as string. + """ + from pathlib import Path + + from .settings import config + + # Split on first null byte + null_pos = stored.index(b"\x00") + filename = stored[:null_pos].decode("utf-8") + contents = stored[null_pos + 1 :] + + # Write to download path + download_path = Path(config.get("download_path", ".")) + download_path.mkdir(parents=True, exist_ok=True) + local_path = download_path / filename + + local_path.write_bytes(contents) + return str(local_path) + + def validate(self, value: Any) -> None: + """Validate that value is a valid file path.""" + from pathlib import Path + + if not isinstance(value, (str, Path)): + raise TypeError(f" expects a file path, got {type(value).__name__}") + + +@register_type +class XAttachType(AttributeType): + """ + External file attachment with content-addressed storage. + + The ```` type stores files externally using content-addressed + storage. Like ````, the filename is preserved and the file is + extracted on fetch. Unlike ````, files are stored externally + with automatic deduplication. 
+ + Example:: + + @schema + class LargeDocuments(dj.Manual): + definition = ''' + doc_id : int + --- + dataset : + ''' + + # Insert a large file + table.insert1({'doc_id': 1, 'dataset': '/path/to/large_file.h5'}) + + # Fetch downloads and returns local path + local_path = (table & 'doc_id=1').fetch1('dataset') + + Type Composition: + ```` composes with ````:: + + Insert: file → read + encode filename → put_content() → JSON + Fetch: JSON → get_content() → extract → local path + + Comparison:: + + | Type | Storage | Deduplication | Best for | + |------------|----------|---------------|---------------------| + | | Database | No | Small files (<16MB) | + | | External | Yes | Large files | + """ + + type_name = "xattach" + dtype = "" # Composition: uses ContentType + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: + """ + Read file and encode as filename + contents. + + Args: + value: Path to file (str or Path). + key: Primary key values (unused). + store_name: Passed to ContentType for storage. + + Returns: + Bytes: filename (UTF-8) + null byte + file contents + """ + from pathlib import Path + + path = Path(value) + if not path.exists(): + raise FileNotFoundError(f"Attachment file not found: {path}") + if path.is_dir(): + raise IsADirectoryError(f" does not support directories: {path}") + + filename = path.name + contents = path.read_bytes() + return filename.encode("utf-8") + b"\x00" + contents + + def decode(self, stored: bytes, *, key: dict | None = None) -> str: + """ + Extract file to download path and return local path. + + Args: + stored: Bytes containing filename + null + contents. + key: Primary key values (unused). + + Returns: + Path to extracted file as string. 
+ """ + from pathlib import Path + + from .settings import config + + # Split on first null byte + null_pos = stored.index(b"\x00") + filename = stored[:null_pos].decode("utf-8") + contents = stored[null_pos + 1 :] + + # Write to download path + download_path = Path(config.get("download_path", ".")) + download_path.mkdir(parents=True, exist_ok=True) + local_path = download_path / filename + + local_path.write_bytes(contents) + return str(local_path) + + def validate(self, value: Any) -> None: + """Validate that value is a valid file path.""" + from pathlib import Path + + if not isinstance(value, (str, Path)): + raise TypeError(f" expects a file path, got {type(value).__name__}") + + +# ============================================================================= +# Filepath Reference Type +# ============================================================================= + + +@register_type +class FilepathType(AttributeType): + """ + Reference to existing file in configured store. + + The ```` type stores a reference to a file that already + exists in the storage backend. Unlike ```` or ````, no + file copying occurs - only the path is recorded. + + This is useful when: + - Files are managed externally (e.g., by acquisition software) + - Files are too large to copy + - You want to reference shared datasets + + Example:: + + @schema + class Recordings(dj.Manual): + definition = ''' + recording_id : int + --- + raw_data : + ''' + + # Reference an existing file (no copy) + table.insert1({'recording_id': 1, 'raw_data': 'subject01/session001/data.bin'}) + + # Fetch returns ObjectRef for lazy access + ref = (table & 'recording_id=1').fetch1('raw_data') + ref.read() # Read file content + ref.download() # Download to local path + + Storage Format: + JSON metadata: ``{path, store}`` + + Warning: + The file must exist in the store at the specified path. + DataJoint does not manage the lifecycle of referenced files. 
+ """ + + type_name = "filepath" + dtype = "json" + + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> dict: + """ + Store path reference as JSON metadata. + + Args: + value: Relative path within the store (str). + key: Primary key values (unused). + store_name: Store where the file exists. + + Returns: + Metadata dict: {path, store} + """ + from datetime import datetime, timezone + + from .content_registry import get_store_backend + + path = str(value) + + # Optionally verify file exists + backend = get_store_backend(store_name) + if not backend.exists(path): + raise FileNotFoundError(f"File not found in store '{store_name or 'default'}': {path}") + + # Get file info + try: + size = backend.size(path) + except Exception: + size = None + + return { + "path": path, + "store": store_name, + "size": size, + "is_dir": False, + "timestamp": datetime.now(timezone.utc).isoformat(), + } + + def decode(self, stored: dict, *, key: dict | None = None) -> Any: + """ + Create ObjectRef handle for lazy access. + + Args: + stored: Metadata dict with path and store. + key: Primary key values (unused). + + Returns: + ObjectRef for accessing the file. + """ + from .content_registry import get_store_backend + from .objectref import ObjectRef + + store_name = stored.get("store") + backend = get_store_backend(store_name) + return ObjectRef.from_json(stored, backend=backend) + + def validate(self, value: Any) -> None: + """Validate that value is a path string.""" + if not isinstance(value, str): + raise TypeError(f" expects a path string, got {type(value).__name__}") diff --git a/src/datajoint/content_registry.py b/src/datajoint/content_registry.py new file mode 100644 index 000000000..01e5844cf --- /dev/null +++ b/src/datajoint/content_registry.py @@ -0,0 +1,189 @@ +""" +Content-addressed storage registry for DataJoint. + +This module provides content-addressed storage with deduplication for the +AttributeType. 
Content is identified by its SHA256 hash and stored in a hierarchical +directory structure: _content/{hash[:2]}/{hash[2:4]}/{hash} + +The ContentRegistry tracks stored content for garbage collection purposes. +""" + +import hashlib +import logging +from typing import Any + +from .errors import DataJointError +from .settings import config +from .storage import StorageBackend + +logger = logging.getLogger(__name__.split(".")[0]) + + +def compute_content_hash(data: bytes) -> str: + """ + Compute SHA256 hash of content. + + Args: + data: Content bytes + + Returns: + Hex-encoded SHA256 hash (64 characters) + """ + return hashlib.sha256(data).hexdigest() + + +def build_content_path(content_hash: str) -> str: + """ + Build the storage path for content-addressed storage. + + Content is stored in a hierarchical structure to avoid too many files + in a single directory: _content/{hash[:2]}/{hash[2:4]}/{hash} + + Args: + content_hash: SHA256 hex hash (64 characters) + + Returns: + Relative path within the store + """ + if len(content_hash) != 64: + raise DataJointError(f"Invalid content hash length: {len(content_hash)} (expected 64)") + return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + + +def get_store_backend(store_name: str | None = None) -> StorageBackend: + """ + Get a StorageBackend for content storage. + + Args: + store_name: Name of the store to use. If None, uses the default store. + + Returns: + StorageBackend instance + """ + if store_name is None: + # Use default store from object_storage settings + store_name = config.object_storage.default_store + if store_name is None: + raise DataJointError( + "No default store configured. Set object_storage.default_store " "or specify a store name explicitly." + ) + + spec = config.get_object_store_spec(store_name) + return StorageBackend(spec) + + +def put_content(data: bytes, store_name: str | None = None) -> dict[str, Any]: + """ + Store content using content-addressed storage. 
+ + If the content already exists (same hash), it is not re-uploaded. + Returns metadata including the hash, store, and size. + + Args: + data: Content bytes to store + store_name: Name of the store. If None, uses default store. + + Returns: + Metadata dict with keys: hash, store, size + """ + content_hash = compute_content_hash(data) + path = build_content_path(content_hash) + + backend = get_store_backend(store_name) + + # Check if content already exists (deduplication) + if not backend.exists(path): + backend.put_buffer(data, path) + logger.debug(f"Stored new content: {content_hash[:16]}... ({len(data)} bytes)") + else: + logger.debug(f"Content already exists: {content_hash[:16]}...") + + return { + "hash": content_hash, + "store": store_name, + "size": len(data), + } + + +def get_content(content_hash: str, store_name: str | None = None) -> bytes: + """ + Retrieve content by its hash. + + Args: + content_hash: SHA256 hex hash of the content + store_name: Name of the store. If None, uses default store. + + Returns: + Content bytes + + Raises: + MissingExternalFile: If content is not found + DataJointError: If hash verification fails + """ + path = build_content_path(content_hash) + backend = get_store_backend(store_name) + + data = backend.get_buffer(path) + + # Verify hash (optional but recommended for integrity) + actual_hash = compute_content_hash(data) + if actual_hash != content_hash: + raise DataJointError(f"Content hash mismatch: expected {content_hash[:16]}..., " f"got {actual_hash[:16]}...") + + return data + + +def content_exists(content_hash: str, store_name: str | None = None) -> bool: + """ + Check if content exists in storage. + + Args: + content_hash: SHA256 hex hash of the content + store_name: Name of the store. If None, uses default store. 
+ + Returns: + True if content exists + """ + path = build_content_path(content_hash) + backend = get_store_backend(store_name) + return backend.exists(path) + + +def delete_content(content_hash: str, store_name: str | None = None) -> bool: + """ + Delete content from storage. + + WARNING: This should only be called after verifying no references exist. + Use garbage collection to safely remove unreferenced content. + + Args: + content_hash: SHA256 hex hash of the content + store_name: Name of the store. If None, uses default store. + + Returns: + True if content was deleted, False if it didn't exist + """ + path = build_content_path(content_hash) + backend = get_store_backend(store_name) + + if backend.exists(path): + backend.remove(path) + logger.debug(f"Deleted content: {content_hash[:16]}...") + return True + return False + + +def get_content_size(content_hash: str, store_name: str | None = None) -> int: + """ + Get the size of stored content. + + Args: + content_hash: SHA256 hex hash of the content + store_name: Name of the store. If None, uses default store. 
+ + Returns: + Size in bytes + """ + path = build_content_path(content_hash) + backend = get_store_backend(store_name) + return backend.size(path) diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index a1613d7d2..68286de2c 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -9,95 +9,88 @@ import pyparsing as pp -from .attribute_adapter import get_adapter +from .attribute_type import get_adapter from .condition import translate_attribute -from .errors import FILEPATH_FEATURE_SWITCH, DataJointError, _support_filepath_types +from .errors import DataJointError from .settings import config -UUID_DATA_TYPE = "binary(16)" - -# Type aliases for numeric types -SQL_TYPE_ALIASES = { - "FLOAT32": "float", - "FLOAT64": "double", - "INT64": "bigint", - "UINT64": "bigint unsigned", - "INT32": "int", - "UINT32": "int unsigned", - "INT16": "smallint", - "UINT16": "smallint unsigned", - "INT8": "tinyint", - "UINT8": "tinyint unsigned", - "BOOL": "tinyint", +# Core DataJoint types - scientist-friendly names that are fully supported +# These are recorded in field comments using :type: syntax for reconstruction +# Format: pattern_name -> (regex_pattern, mysql_type or None if same as matched) +CORE_TYPES = { + # Numeric types (aliased to native SQL) + "float32": (r"float32$", "float"), + "float64": (r"float64$", "double"), + "int64": (r"int64$", "bigint"), + "uint64": (r"uint64$", "bigint unsigned"), + "int32": (r"int32$", "int"), + "uint32": (r"uint32$", "int unsigned"), + "int16": (r"int16$", "smallint"), + "uint16": (r"uint16$", "smallint unsigned"), + "int8": (r"int8$", "tinyint"), + "uint8": (r"uint8$", "tinyint unsigned"), + "bool": (r"bool$", "tinyint"), + # UUID (stored as binary) + "uuid": (r"uuid$", "binary(16)"), + # JSON + "json": (r"json$", None), # json passes through as-is + # Binary (blob maps to longblob) + "blob": (r"blob$", "longblob"), + # Temporal + "date": (r"date$", None), + "datetime": (r"datetime$", None), + # String types 
(with parameters) + "char": (r"char\s*\(\d+\)$", None), + "varchar": (r"varchar\s*\(\d+\)$", None), + # Enumeration + "enum": (r"enum\s*\(.+\)$", None), } + +# Compile core type patterns +CORE_TYPE_PATTERNS = {name: re.compile(pattern, re.I) for name, (pattern, _) in CORE_TYPES.items()} + +# Get SQL mapping for core types +CORE_TYPE_SQL = {name: sql_type for name, (_, sql_type) in CORE_TYPES.items()} + MAX_TABLE_NAME_LENGTH = 64 CONSTANT_LITERALS = { "CURRENT_TIMESTAMP", "NULL", } # SQL literals to be used without quotes (case insensitive) -EXTERNAL_TABLE_ROOT = "~external" +# Type patterns for declaration parsing TYPE_PATTERN = { k: re.compile(v, re.I) for k, v in dict( - # Type aliases must come before INTEGER and FLOAT patterns to avoid prefix matching - FLOAT32=r"float32$", - FLOAT64=r"float64$", - INT64=r"int64$", - UINT64=r"uint64$", - INT32=r"int32$", - UINT32=r"uint32$", - INT16=r"int16$", - UINT16=r"uint16$", - INT8=r"int8$", - UINT8=r"uint8$", - BOOL=r"bool$", # aliased to tinyint - # Native MySQL types + # Core DataJoint types + **{name.upper(): pattern for name, (pattern, _) in CORE_TYPES.items()}, + # Native SQL types (passthrough with warning for non-standard use) INTEGER=r"((tiny|small|medium|big|)int|integer)(\s*\(.+\))?(\s+unsigned)?(\s+auto_increment)?|serial$", DECIMAL=r"(decimal|numeric)(\s*\(.+\))?(\s+unsigned)?$", FLOAT=r"(double|float|real)(\s*\(.+\))?(\s+unsigned)?$", - STRING=r"(var)?char\s*\(.+\)$", - JSON=r"json$", - ENUM=r"enum\s*\(.+\)$", - TEMPORAL=r"(date|datetime|time|timestamp|year)(\s*\(.+\))?$", - INTERNAL_BLOB=r"(tiny|small|medium|long|)blob$", - EXTERNAL_BLOB=r"blob@(?P[a-z][\-\w]*)$", - INTERNAL_ATTACH=r"attach$", - EXTERNAL_ATTACH=r"attach@(?P[a-z][\-\w]*)$", - FILEPATH=r"filepath@(?P[a-z][\-\w]*)$", - OBJECT=r"object(@(?P[a-z][\-\w]*))?$", # managed object storage (files/folders) - UUID=r"uuid$", + STRING=r"(var)?char\s*\(.+\)$", # Catches char/varchar not matched by core types + 
TEMPORAL=r"(time|timestamp|year)(\s*\(.+\))?$", # time, timestamp, year (not date/datetime) + NATIVE_BLOB=r"(tiny|small|medium|long)blob$", # Specific blob variants + TEXT=r"(tiny|small|medium|long)?text$", # Text types + # AttributeTypes use angle brackets ADAPTED=r"<.+>$", ).items() } -# custom types are stored in attribute comment -SPECIAL_TYPES = { - "UUID", - "INTERNAL_ATTACH", - "EXTERNAL_ATTACH", - "EXTERNAL_BLOB", - "FILEPATH", - "OBJECT", - "ADAPTED", -} | set(SQL_TYPE_ALIASES) +# Core types are stored in attribute comment for reconstruction +CORE_TYPE_NAMES = {name.upper() for name in CORE_TYPES} + +# Special types that need comment storage (core types + adapted) +SPECIAL_TYPES = CORE_TYPE_NAMES | {"ADAPTED"} + +# Native SQL types that pass through (with optional warning) NATIVE_TYPES = set(TYPE_PATTERN) - SPECIAL_TYPES -EXTERNAL_TYPES = { - "EXTERNAL_ATTACH", - "EXTERNAL_BLOB", - "FILEPATH", -} # data referenced by a UUID in external tables -SERIALIZED_TYPES = { - "EXTERNAL_ATTACH", - "INTERNAL_ATTACH", - "EXTERNAL_BLOB", - "INTERNAL_BLOB", -} # requires packing data -assert set().union(SPECIAL_TYPES, EXTERNAL_TYPES, SERIALIZED_TYPES) <= set(TYPE_PATTERN) +assert SPECIAL_TYPES <= set(TYPE_PATTERN) def match_type(attribute_type): + """Match an attribute type string to a category.""" try: return next(category for category, pattern in TYPE_PATTERN.items() if pattern.match(attribute_type)) except StopIteration: @@ -458,47 +451,36 @@ def format_attribute(attr): def substitute_special_type(match, category, foreign_key_sql, context): """ + Substitute special types with their native SQL equivalents. + + Special types are: + - Core DataJoint types (float32 → float, uuid → binary(16), blob → longblob, etc.) 
+ - ADAPTED types (AttributeTypes in angle brackets) + :param match: dict containing with keys "type" and "comment" -- will be modified in place :param category: attribute type category from TYPE_PATTERN :param foreign_key_sql: list of foreign key declarations to add to :param context: context for looking up user-defined attribute_type adapters """ - if category == "UUID": - match["type"] = UUID_DATA_TYPE - elif category == "INTERNAL_ATTACH": - match["type"] = "LONGBLOB" - elif category == "OBJECT": - # Object type stores metadata as JSON - no foreign key to external table - # Extract store name if present (object@store_name syntax) - if "@" in match["type"]: - match["store"] = match["type"].split("@", 1)[1] - match["type"] = "JSON" - elif category in EXTERNAL_TYPES: - if category == "FILEPATH" and not _support_filepath_types(): - raise DataJointError( - """ - The filepath data type is disabled until complete validation. - To turn it on as experimental feature, set the environment variable - {env} = TRUE or upgrade datajoint. - """.format(env=FILEPATH_FEATURE_SWITCH) - ) - match["store"] = match["type"].split("@", 1)[1] - match["type"] = UUID_DATA_TYPE - foreign_key_sql.append( - "FOREIGN KEY (`{name}`) REFERENCES `{{database}}`.`{external_table_root}_{store}` (`hash`) " - "ON UPDATE RESTRICT ON DELETE RESTRICT".format(external_table_root=EXTERNAL_TABLE_ROOT, **match) - ) - elif category == "ADAPTED": - adapter = get_adapter(context, match["type"]) - match["type"] = adapter.attribute_type + if category == "ADAPTED": + # AttributeType - resolve to underlying dtype + attr_type, store_name = get_adapter(context, match["type"]) + if store_name is not None: + match["store"] = store_name + match["type"] = attr_type.dtype + # Recursively resolve if dtype is also a special type category = match_type(match["type"]) if category in SPECIAL_TYPES: - # recursive redefinition from user-defined datatypes. 
substitute_special_type(match, category, foreign_key_sql, context) - elif category in SQL_TYPE_ALIASES: - match["type"] = SQL_TYPE_ALIASES[category] + elif category in CORE_TYPE_NAMES: + # Core DataJoint type - substitute with native SQL type if mapping exists + core_name = category.lower() + sql_type = CORE_TYPE_SQL.get(core_name) + if sql_type is not None: + match["type"] = sql_type + # else: type passes through as-is (json, date, datetime, char, varchar, enum) else: - assert False, "Unknown special type" + assert False, f"Unknown special type: {category}" def compile_attribute(line, in_key, foreign_key_sql, context): @@ -509,7 +491,7 @@ def compile_attribute(line, in_key, foreign_key_sql, context): :param in_key: set to True if attribute is in primary key set :param foreign_key_sql: the list of foreign key declarations to add to :param context: context in which to look up user-defined attribute type adapterss - :returns: (name, sql, is_external) -- attribute name and sql code for its declaration + :returns: (name, sql, store) -- attribute name, sql code for its declaration, and optional store name """ try: match = attribute_parser.parseString(line + "#", parseAll=True) @@ -542,17 +524,23 @@ def compile_attribute(line, in_key, foreign_key_sql, context): raise DataJointError('An attribute comment must not start with a colon in comment "{comment}"'.format(**match)) category = match_type(match["type"]) + if category in SPECIAL_TYPES: - match["comment"] = ":{type}:{comment}".format(**match) # insert custom type into comment + # Core types and AttributeTypes are recorded in comment for reconstruction + match["comment"] = ":{type}:{comment}".format(**match) substitute_special_type(match, category, foreign_key_sql, context) - - if category in SERIALIZED_TYPES and match["default"] not in { - "DEFAULT NULL", - "NOT NULL", - }: - raise DataJointError( - "The default value for a blob or attachment attributes can only be NULL in:\n{line}".format(line=line) + elif category in 
NATIVE_TYPES: + # Non-standard native type - warn user + logger.warning( + f"Non-standard native type '{match['type']}' in attribute '{match['name']}'. " + "Consider using a core DataJoint type for better portability." ) + # Check for invalid default values on blob types (after type substitution) + # Note: blob → longblob, so check for NATIVE_BLOB or longblob result + final_type = match["type"].lower() + if ("blob" in final_type) and match["default"] not in {"DEFAULT NULL", "NOT NULL"}: + raise DataJointError("The default value for blob attributes can only be NULL in:\n{line}".format(line=line)) + sql = ("`{name}` {type} {default}" + (' COMMENT "{comment}"' if match["comment"] else "")).format(**match) return match["name"], sql, match.get("store") diff --git a/src/datajoint/external.py b/src/datajoint/external.py deleted file mode 100644 index 06e76af37..000000000 --- a/src/datajoint/external.py +++ /dev/null @@ -1,452 +0,0 @@ -import logging -import warnings -from collections.abc import Mapping -from pathlib import Path, PurePosixPath, PureWindowsPath - -from tqdm import tqdm - -from .declare import EXTERNAL_TABLE_ROOT -from .errors import DataJointError, MissingExternalFile -from .hash import uuid_from_buffer, uuid_from_file -from .heading import Heading -from .settings import config -from .storage import StorageBackend -from .table import FreeTable, Table -from .utils import safe_write - -logger = logging.getLogger(__name__.split(".")[0]) - -CACHE_SUBFOLDING = ( - 2, - 2, -) # (2, 2) means "0123456789abcd" will be saved as "01/23/0123456789abcd" -SUPPORT_MIGRATED_BLOBS = True # support blobs migrated from datajoint 0.11.* - - -def subfold(name, folds): - """ - subfolding for external storage: e.g. subfold('aBCdefg', (2, 3)) --> ['ab','cde'] - """ - return (name[: folds[0]].lower(),) + subfold(name[folds[0] :], folds[1:]) if folds else () - - -class ExternalTable(Table): - """ - The table tracking externally stored objects. 
- Declare as ExternalTable(connection, database) - """ - - def __init__(self, connection, store, database): - self.store = store - self.database = database - self._connection = connection - self._heading = Heading( - table_info=dict( - conn=connection, - database=database, - table_name=self.table_name, - context=None, - ) - ) - self._support = [self.full_table_name] - if not self.is_declared: - self.declare() - # Initialize storage backend (validates configuration) - self.storage = StorageBackend(config.get_store_spec(store)) - - @property - def definition(self): - return """ - # external storage tracking - hash : uuid # hash of contents (blob), of filename + contents (attach), or relative filepath (filepath) - --- - size :bigint unsigned # size of object in bytes - attachment_name=null : varchar(255) # the filename of an attachment - filepath=null : varchar(1000) # relative filepath or attachment filename - contents_hash=null : uuid # used for the filepath datatype - timestamp=CURRENT_TIMESTAMP :timestamp # automatic timestamp - """ - - @property - def table_name(self): - return f"{EXTERNAL_TABLE_ROOT}_{self.store}" - - @property - def s3(self): - """Deprecated: Use storage property instead.""" - warnings.warn( - "ExternalTable.s3 is deprecated. Use ExternalTable.storage instead.", - DeprecationWarning, - stacklevel=2, - ) - # For backward compatibility, return a legacy s3.Folder if needed - from . 
import s3 - - if not hasattr(self, "_s3_legacy") or self._s3_legacy is None: - self._s3_legacy = s3.Folder(**self.storage.spec) - return self._s3_legacy - - # - low-level operations - private - - def _make_external_filepath(self, relative_filepath): - """resolve the complete external path based on the relative path""" - spec = self.storage.spec - # Strip root for S3 paths - if spec["protocol"] == "s3": - posix_path = PurePosixPath(PureWindowsPath(spec["location"])) - location_path = ( - Path(*posix_path.parts[1:]) - if len(spec["location"]) > 0 and any(case in posix_path.parts[0] for case in ("\\", ":")) - else Path(posix_path) - ) - return PurePosixPath(location_path, relative_filepath) - # Preserve root for local filesystem - elif spec["protocol"] == "file": - return PurePosixPath(Path(spec["location"]), relative_filepath) - else: - # For other protocols (gcs, azure, etc.), treat like S3 - location = spec.get("location", "") - return PurePosixPath(location, relative_filepath) if location else PurePosixPath(relative_filepath) - - def _make_uuid_path(self, uuid, suffix=""): - """create external path based on the uuid hash""" - return self._make_external_filepath( - PurePosixPath( - self.database, - "/".join(subfold(uuid.hex, self.storage.spec["subfolding"])), - uuid.hex, - ).with_suffix(suffix) - ) - - def _upload_file(self, local_path, external_path, metadata=None): - """Upload a file to external storage using fsspec backend.""" - self.storage.put_file(local_path, external_path, metadata) - - def _download_file(self, external_path, download_path): - """Download a file from external storage using fsspec backend.""" - self.storage.get_file(external_path, download_path) - - def _upload_buffer(self, buffer, external_path): - """Upload bytes to external storage using fsspec backend.""" - self.storage.put_buffer(buffer, external_path) - - def _download_buffer(self, external_path): - """Download bytes from external storage using fsspec backend.""" - return 
self.storage.get_buffer(external_path) - - def _remove_external_file(self, external_path): - """Remove a file from external storage using fsspec backend.""" - self.storage.remove(external_path) - - def exists(self, external_filepath): - """ - Check if an external file is accessible using fsspec backend. - - :return: True if the external file is accessible - """ - return self.storage.exists(external_filepath) - - # --- BLOBS ---- - - def put(self, blob): - """ - put a binary string (blob) in external store - """ - uuid = uuid_from_buffer(blob) - self._upload_buffer(blob, self._make_uuid_path(uuid)) - # insert tracking info - self.connection.query( - "INSERT INTO {tab} (hash, size) VALUES (%s, {size}) ON DUPLICATE KEY UPDATE timestamp=CURRENT_TIMESTAMP".format( - tab=self.full_table_name, size=len(blob) - ), - args=(uuid.bytes,), - ) - return uuid - - def get(self, uuid): - """ - get an object from external store. - """ - if uuid is None: - return None - # attempt to get object from cache - blob = None - cache_folder = config.get("cache", None) - if cache_folder: - try: - cache_path = Path(cache_folder, *subfold(uuid.hex, CACHE_SUBFOLDING)) - cache_file = Path(cache_path, uuid.hex) - blob = cache_file.read_bytes() - except FileNotFoundError: - pass # not cached - # download blob from external store - if blob is None: - try: - blob = self._download_buffer(self._make_uuid_path(uuid)) - except MissingExternalFile: - if not SUPPORT_MIGRATED_BLOBS: - raise - # blobs migrated from datajoint 0.11 are stored at explicitly defined filepaths - relative_filepath, contents_hash = (self & {"hash": uuid}).fetch1("filepath", "contents_hash") - if relative_filepath is None: - raise - blob = self._download_buffer(self._make_external_filepath(relative_filepath)) - if cache_folder: - cache_path.mkdir(parents=True, exist_ok=True) - safe_write(cache_path / uuid.hex, blob) - return blob - - # --- ATTACHMENTS --- - - def upload_attachment(self, local_path): - attachment_name = 
Path(local_path).name - uuid = uuid_from_file(local_path, init_string=attachment_name + "\0") - external_path = self._make_uuid_path(uuid, "." + attachment_name) - self._upload_file(local_path, external_path) - # insert tracking info - self.connection.query( - """ - INSERT INTO {tab} (hash, size, attachment_name) - VALUES (%s, {size}, "{attachment_name}") - ON DUPLICATE KEY UPDATE timestamp=CURRENT_TIMESTAMP""".format( - tab=self.full_table_name, - size=Path(local_path).stat().st_size, - attachment_name=attachment_name, - ), - args=[uuid.bytes], - ) - return uuid - - def get_attachment_name(self, uuid): - return (self & {"hash": uuid}).fetch1("attachment_name") - - def download_attachment(self, uuid, attachment_name, download_path): - """save attachment from memory buffer into the save_path""" - external_path = self._make_uuid_path(uuid, "." + attachment_name) - self._download_file(external_path, download_path) - - # --- FILEPATH --- - - def upload_filepath(self, local_filepath): - """ - Raise exception if an external entry already exists with a different contents checksum. 
- Otherwise, copy (with overwrite) file to remote and - If an external entry exists with the same checksum, then no copying should occur - """ - local_filepath = Path(local_filepath) - try: - relative_filepath = str(local_filepath.relative_to(self.storage.spec["stage"]).as_posix()) - except ValueError: - raise DataJointError(f"The path {local_filepath.parent} is not in stage {self.storage.spec['stage']}") - uuid = uuid_from_buffer(init_string=relative_filepath) # hash relative path, not contents - contents_hash = uuid_from_file(local_filepath) - - # check if the remote file already exists and verify that it matches - check_hash = (self & {"hash": uuid}).fetch("contents_hash") - if check_hash.size: - # the tracking entry exists, check that it's the same file as before - if contents_hash != check_hash[0]: - raise DataJointError(f"A different version of '{relative_filepath}' has already been placed.") - else: - # upload the file and create its tracking entry - self._upload_file( - local_filepath, - self._make_external_filepath(relative_filepath), - metadata={"contents_hash": str(contents_hash)}, - ) - self.connection.query( - "INSERT INTO {tab} (hash, size, filepath, contents_hash) VALUES (%s, {size}, '{filepath}', %s)".format( - tab=self.full_table_name, - size=Path(local_filepath).stat().st_size, - filepath=relative_filepath, - ), - args=(uuid.bytes, contents_hash.bytes), - ) - return uuid - - def download_filepath(self, filepath_hash): - """ - sync a file from external store to the local stage - - :param filepath_hash: The hash (UUID) of the relative_path - :return: hash (UUID) of the contents of the downloaded file or Nones - """ - - def _need_checksum(local_filepath, expected_size): - limit = config.get("filepath_checksum_size_limit") - actual_size = Path(local_filepath).stat().st_size - if expected_size != actual_size: - # this should never happen without outside interference - raise DataJointError(f"'{local_filepath}' downloaded but size did not match.") - 
return limit is None or actual_size < limit - - if filepath_hash is not None: - relative_filepath, contents_hash, size = (self & {"hash": filepath_hash}).fetch1( - "filepath", "contents_hash", "size" - ) - external_path = self._make_external_filepath(relative_filepath) - local_filepath = Path(self.storage.spec["stage"]).absolute() / relative_filepath - - file_exists = Path(local_filepath).is_file() and ( - not _need_checksum(local_filepath, size) or uuid_from_file(local_filepath) == contents_hash - ) - - if not file_exists: - self._download_file(external_path, local_filepath) - if _need_checksum(local_filepath, size) and uuid_from_file(local_filepath) != contents_hash: - # this should never happen without outside interference - raise DataJointError(f"'{local_filepath}' downloaded but did not pass checksum.") - if not _need_checksum(local_filepath, size): - logger.warning(f"Skipped checksum for file with hash: {contents_hash}, and path: {local_filepath}") - return str(local_filepath), contents_hash - - # --- UTILITIES --- - - @property - def references(self): - """ - :return: generator of referencing table names and their referencing columns - """ - return ( - {k.lower(): v for k, v in elem.items()} - for elem in self.connection.query( - """ - SELECT concat('`', table_schema, '`.`', table_name, '`') as referencing_table, column_name - FROM information_schema.key_column_usage - WHERE referenced_table_name="{tab}" and referenced_table_schema="{db}" - """.format(tab=self.table_name, db=self.database), - as_dict=True, - ) - ) - - def fetch_external_paths(self, **fetch_kwargs): - """ - generate complete external filepaths from the query. - Each element is a tuple: (uuid, path) - - :param fetch_kwargs: keyword arguments to pass to fetch - """ - fetch_kwargs.update(as_dict=True) - paths = [] - for item in self.fetch("hash", "attachment_name", "filepath", **fetch_kwargs): - if item["attachment_name"]: - # attachments - path = self._make_uuid_path(item["hash"], "." 
+ item["attachment_name"]) - elif item["filepath"]: - # external filepaths - path = self._make_external_filepath(item["filepath"]) - else: - # blobs - path = self._make_uuid_path(item["hash"]) - paths.append((item["hash"], path)) - return paths - - def unused(self): - """ - query expression for unused hashes - - :return: self restricted to elements that are not in use by any tables in the schema - """ - return self - [ - FreeTable(self.connection, ref["referencing_table"]).proj(hash=ref["column_name"]) for ref in self.references - ] - - def used(self): - """ - query expression for used hashes - - :return: self restricted to elements that in use by tables in the schema - """ - return self & [ - FreeTable(self.connection, ref["referencing_table"]).proj(hash=ref["column_name"]) for ref in self.references - ] - - def delete( - self, - *, - delete_external_files=None, - limit=None, - display_progress=True, - errors_as_string=True, - ): - """ - - :param delete_external_files: True or False. If False, only the tracking info is removed from the external - store table but the external files remain intact. If True, then the external files themselves are deleted too. 
- :param errors_as_string: If True any errors returned when deleting from external files will be strings - :param limit: (integer) limit the number of items to delete - :param display_progress: if True, display progress as files are cleaned up - :return: if deleting external files, returns errors - """ - if delete_external_files not in (True, False): - raise DataJointError("The delete_external_files argument must be set to either True or False in delete()") - - if not delete_external_files: - self.unused().delete_quick() - else: - items = self.unused().fetch_external_paths(limit=limit) - if display_progress: - items = tqdm(items) - # delete items one by one, close to transaction-safe - error_list = [] - for uuid, external_path in items: - row = (self & {"hash": uuid}).fetch() - if row.size: - try: - (self & {"hash": uuid}).delete_quick() - except Exception: - pass # if delete failed, do not remove the external file - else: - try: - self._remove_external_file(external_path) - except Exception as error: - # adding row back into table after failed delete - self.insert1(row[0], skip_duplicates=True) - error_list.append( - ( - uuid, - external_path, - str(error) if errors_as_string else error, - ) - ) - return error_list - - -class ExternalMapping(Mapping): - """ - The external manager contains all the tables for all external stores for a given schema - :Example: - e = ExternalMapping(schema) - external_table = e[store] - """ - - def __init__(self, schema): - self.schema = schema - self._tables = {} - - def __repr__(self): - return "External file tables for schema `{schema}`:\n ".format(schema=self.schema.database) + "\n ".join( - '"{store}" {protocol}:{location}'.format(store=k, **v.spec) for k, v in self.items() - ) - - def __getitem__(self, store): - """ - Triggers the creation of an external table. - Should only be used when ready to save or read from external storage. 
- - :param store: the name of the store - :return: the ExternalTable object for the store - """ - if store not in self._tables: - self._tables[store] = ExternalTable( - connection=self.schema.connection, - store=store, - database=self.schema.database, - ) - return self._tables[store] - - def __len__(self): - return len(self._tables) - - def __iter__(self): - return iter(self._tables) diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index 3dab1f38b..d021a87d8 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -1,21 +1,15 @@ -import itertools import json import numbers -import uuid +import uuid as uuid_module from functools import partial -from pathlib import Path import numpy as np import pandas from datajoint.condition import Top -from . import blob, hash from .errors import DataJointError -from .objectref import ObjectRef from .settings import config -from .storage import StorageBackend -from .utils import safe_write class key: @@ -39,79 +33,60 @@ def to_dicts(recarray): def _get(connection, attr, data, squeeze, download_path): """ - This function is called for every attribute + Retrieve and decode attribute data from the database. + + In the simplified type system: + - Native types pass through unchanged + - JSON types are parsed + - UUID types are converted from bytes + - Blob types return raw bytes (unless an adapter handles them) + - Adapters (AttributeTypes) handle all custom encoding/decoding via type chains + + For composed types (e.g., using ), decoders are applied + in reverse order: innermost first, then outermost. :param connection: a dj.Connection object - :param attr: attribute name from the table's heading - :param data: literal value fetched from the table - :param squeeze: if True squeeze blobs - :param download_path: for fetches that download data, e.g. 
attachments - :return: unpacked data + :param attr: attribute from the table's heading + :param data: raw value fetched from the database + :param squeeze: if True squeeze blobs (legacy, unused) + :param download_path: for fetches that download data (legacy, unused in simplified model) + :return: decoded data """ if data is None: - return - if attr.is_object: - # Object type - return ObjectRef handle - json_data = json.loads(data) if isinstance(data, str) else data - # Get the correct backend based on store name in metadata - store_name = json_data.get("store") # None for default store - try: - spec = config.get_object_store_spec(store_name) - backend = StorageBackend(spec) - except DataJointError: - backend = None - return ObjectRef.from_json(json_data, backend=backend) + return None + + # Get the final storage type and type chain if adapter present + if attr.adapter: + from .attribute_type import resolve_dtype + + final_dtype, type_chain, _ = resolve_dtype(f"<{attr.adapter.type_name}>") + + # First, process the final dtype (what's stored in the database) + if final_dtype.lower() == "json": + data = json.loads(data) + elif final_dtype.lower() in ("longblob", "blob", "mediumblob", "tinyblob"): + pass # Blob data is already bytes + elif final_dtype.lower() == "binary(16)": + data = uuid_module.UUID(bytes=data) + + # Apply decoders in reverse order: innermost first, then outermost + for attr_type in reversed(type_chain): + data = attr_type.decode(data, key=None) + + return data + + # No adapter - handle native types if attr.json: return json.loads(data) - extern = connection.schemas[attr.database].external[attr.store] if attr.is_external else None - - # apply attribute adapter if present - adapt = attr.adapter.get if attr.adapter else lambda x: x - - if attr.is_filepath: - return adapt(extern.download_filepath(uuid.UUID(bytes=data))[0]) - if attr.is_attachment: - # Steps: - # 1. get the attachment filename - # 2. 
check if the file already exists at download_path, verify checksum - # 3. if exists and checksum passes then return the local filepath - # 4. Otherwise, download the remote file and return the new filepath - _uuid = uuid.UUID(bytes=data) if attr.is_external else None - attachment_name = extern.get_attachment_name(_uuid) if attr.is_external else data.split(b"\0", 1)[0].decode() - local_filepath = Path(download_path) / attachment_name - if local_filepath.is_file(): - attachment_checksum = _uuid if attr.is_external else hash.uuid_from_buffer(data) - if attachment_checksum == hash.uuid_from_file(local_filepath, init_string=attachment_name + "\0"): - return adapt(str(local_filepath)) # checksum passed, no need to download again - # generate the next available alias filename - for n in itertools.count(): - f = local_filepath.parent / (local_filepath.stem + "_%04x" % n + local_filepath.suffix) - if not f.is_file(): - local_filepath = f - break - if attachment_checksum == hash.uuid_from_file(f, init_string=attachment_name + "\0"): - return adapt(str(f)) # checksum passed, no need to download again - # Save attachment - if attr.is_external: - extern.download_attachment(_uuid, attachment_name, local_filepath) - else: - # write from buffer - safe_write(local_filepath, data.split(b"\0", 1)[1]) - return adapt(str(local_filepath)) # download file from remote store - - return adapt( - uuid.UUID(bytes=data) - if attr.uuid - else ( - blob.unpack( - extern.get(uuid.UUID(bytes=data)) if attr.is_external else data, - squeeze=squeeze, - ) - if attr.is_blob - else data - ) - ) + if attr.uuid: + return uuid_module.UUID(bytes=data) + + if attr.is_blob: + return data # raw bytes + + # Native types - pass through unchanged + return data class Fetch: diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py new file mode 100644 index 000000000..e0b7aaafe --- /dev/null +++ b/src/datajoint/gc.py @@ -0,0 +1,591 @@ +""" +Garbage collection for external storage. 
+
+This module provides utilities to identify and remove orphaned content
+from external storage. Content becomes orphaned when all database rows
+referencing it are deleted.
+
+Supports two storage patterns:
+- Content-addressed storage: <content>, <xblob>, <xattach>
+  Stored at: _content/{hash[:2]}/{hash[2:4]}/{hash}
+
+- Path-addressed storage: <object>
+  Stored at: {schema}/{table}/objects/{pk}/{field}_{token}/
+
+Usage:
+    import datajoint as dj
+
+    # Scan schemas and find orphaned content
+    stats = dj.gc.scan(schema1, schema2, store_name='mystore')
+
+    # Remove orphaned content (dry_run=False to actually delete)
+    stats = dj.gc.collect(schema1, schema2, store_name='mystore', dry_run=True)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import TYPE_CHECKING, Any
+
+from .content_registry import delete_content, get_store_backend
+from .errors import DataJointError
+
+if TYPE_CHECKING:
+    from .schemas import Schema
+
+logger = logging.getLogger(__name__.split(".")[0])
+
+
+def _uses_content_storage(attr) -> bool:
+    """
+    Check if an attribute uses content-addressed storage.
+
+    This includes types that compose with <content>:
+    - <content> directly
+    - <xblob> (composes with <content>)
+    - <xattach> (composes with <content>)
+
+    Args:
+        attr: Attribute from table heading
+
+    Returns:
+        True if the attribute stores content hashes
+    """
+    if not attr.adapter:
+        return False
+
+    # Check if this type or its composition chain uses content storage
+    type_name = getattr(attr.adapter, "type_name", "")
+    return type_name in ("content", "xblob", "xattach")
+
+
+def _uses_object_storage(attr) -> bool:
+    """
+    Check if an attribute uses path-addressed object storage.
+ + Args: + attr: Attribute from table heading + + Returns: + True if the attribute stores object paths + """ + if not attr.adapter: + return False + + type_name = getattr(attr.adapter, "type_name", "") + return type_name == "object" + + +def _extract_content_refs(value: Any) -> list[tuple[str, str | None]]: + """ + Extract content references from a stored value. + + Args: + value: The stored value (could be JSON string or dict) + + Returns: + List of (content_hash, store_name) tuples + """ + refs = [] + + if value is None: + return refs + + # Parse JSON if string + if isinstance(value, str): + try: + value = json.loads(value) + except (json.JSONDecodeError, TypeError): + return refs + + # Extract hash from dict + if isinstance(value, dict) and "hash" in value: + refs.append((value["hash"], value.get("store"))) + + return refs + + +def _extract_object_refs(value: Any) -> list[tuple[str, str | None]]: + """ + Extract object path references from a stored value. + + Args: + value: The stored value (could be JSON string or dict) + + Returns: + List of (path, store_name) tuples + """ + refs = [] + + if value is None: + return refs + + # Parse JSON if string + if isinstance(value, str): + try: + value = json.loads(value) + except (json.JSONDecodeError, TypeError): + return refs + + # Extract path from dict + if isinstance(value, dict) and "path" in value: + refs.append((value["path"], value.get("store"))) + + return refs + + +def scan_references( + *schemas: "Schema", + store_name: str | None = None, + verbose: bool = False, +) -> set[str]: + """ + Scan schemas for content references. + + Examines all tables in the given schemas and extracts content hashes + from columns that use content-addressed storage (, , ). 
+ + Args: + *schemas: Schema instances to scan + store_name: Only include references to this store (None = all stores) + verbose: Print progress information + + Returns: + Set of content hashes that are referenced + """ + referenced: set[str] = set() + + for schema in schemas: + if verbose: + logger.info(f"Scanning schema: {schema.database}") + + # Get all tables in schema + for table_name in schema.list_tables(): + try: + # Get table class + table = schema.spawn_table(table_name) + + # Check each attribute for content storage + for attr_name, attr in table.heading.attributes.items(): + if not _uses_content_storage(attr): + continue + + if verbose: + logger.info(f" Scanning {table_name}.{attr_name}") + + # Fetch all values for this attribute + # Use raw fetch to get JSON strings + try: + values = table.fetch(attr_name) + for value in values: + for content_hash, ref_store in _extract_content_refs(value): + # Filter by store if specified + if store_name is None or ref_store == store_name: + referenced.add(content_hash) + except Exception as e: + logger.warning(f"Error scanning {table_name}.{attr_name}: {e}") + + except Exception as e: + logger.warning(f"Error accessing table {table_name}: {e}") + + return referenced + + +def scan_object_references( + *schemas: "Schema", + store_name: str | None = None, + verbose: bool = False, +) -> set[str]: + """ + Scan schemas for object path references. + + Examines all tables in the given schemas and extracts object paths + from columns that use path-addressed storage (). 
+ + Args: + *schemas: Schema instances to scan + store_name: Only include references to this store (None = all stores) + verbose: Print progress information + + Returns: + Set of object paths that are referenced + """ + referenced: set[str] = set() + + for schema in schemas: + if verbose: + logger.info(f"Scanning schema for objects: {schema.database}") + + # Get all tables in schema + for table_name in schema.list_tables(): + try: + # Get table class + table = schema.spawn_table(table_name) + + # Check each attribute for object storage + for attr_name, attr in table.heading.attributes.items(): + if not _uses_object_storage(attr): + continue + + if verbose: + logger.info(f" Scanning {table_name}.{attr_name}") + + # Fetch all values for this attribute + try: + values = table.fetch(attr_name) + for value in values: + for path, ref_store in _extract_object_refs(value): + # Filter by store if specified + if store_name is None or ref_store == store_name: + referenced.add(path) + except Exception as e: + logger.warning(f"Error scanning {table_name}.{attr_name}: {e}") + + except Exception as e: + logger.warning(f"Error accessing table {table_name}: {e}") + + return referenced + + +def list_stored_content(store_name: str | None = None) -> dict[str, int]: + """ + List all content hashes in storage. + + Scans the _content/ directory in the specified store and returns + all content hashes found. 
+
+    Args:
+        store_name: Store to scan (None = default store)
+
+    Returns:
+        Dict mapping content_hash to size in bytes
+    """
+    backend = get_store_backend(store_name)
+    stored: dict[str, int] = {}
+
+    # Content is stored at _content/{hash[:2]}/{hash[2:4]}/{hash}
+    content_prefix = "_content/"
+
+    try:
+        # List all files under _content/
+        full_prefix = backend._full_path(content_prefix)
+
+        for root, dirs, files in backend.fs.walk(full_prefix):
+            for filename in files:
+                # Skip manifest files
+                if filename.endswith(".manifest.json"):
+                    continue
+
+                # The filename is the full hash
+                content_hash = filename
+
+                # Validate it looks like a hash (64 hex chars)
+                if len(content_hash) == 64 and all(c in "0123456789abcdef" for c in content_hash):
+                    try:
+                        file_path = f"{root}/{filename}"
+                        size = backend.fs.size(file_path)
+                        stored[content_hash] = size
+                    except Exception:
+                        stored[content_hash] = 0
+
+    except FileNotFoundError:
+        # No _content/ directory exists yet
+        pass
+    except Exception as e:
+        logger.warning(f"Error listing stored content: {e}")
+
+    return stored
+
+
+def list_stored_objects(store_name: str | None = None) -> dict[str, int]:
+    """
+    List all object paths in storage.
+ + Scans for directories matching the object storage pattern: + {schema}/{table}/objects/{pk}/{field}_{token}/ + + Args: + store_name: Store to scan (None = default store) + + Returns: + Dict mapping object_path to size in bytes + """ + backend = get_store_backend(store_name) + stored: dict[str, int] = {} + + try: + # Walk the storage looking for /objects/ directories + full_prefix = backend._full_path("") + + for root, dirs, files in backend.fs.walk(full_prefix): + # Skip _content directory + if "_content" in root: + continue + + # Look for "objects" directory pattern + if "/objects/" in root: + # This could be an object storage path + # Path pattern: {schema}/{table}/objects/{pk}/{field}_{token} + relative_path = root.replace(full_prefix, "").lstrip("/") + + # Calculate total size of this object directory + total_size = 0 + for file in files: + try: + file_path = f"{root}/{file}" + total_size += backend.fs.size(file_path) + except Exception: + pass + + # Only count directories with files (actual objects) + if total_size > 0 or files: + stored[relative_path] = total_size + + except FileNotFoundError: + pass + except Exception as e: + logger.warning(f"Error listing stored objects: {e}") + + return stored + + +def delete_object(path: str, store_name: str | None = None) -> bool: + """ + Delete an object directory from storage. 
+ + Args: + path: Object path (relative to store root) + store_name: Store name (None = default store) + + Returns: + True if deleted, False if not found + """ + backend = get_store_backend(store_name) + + try: + full_path = backend._full_path(path) + if backend.fs.exists(full_path): + # Remove entire directory tree + backend.fs.rm(full_path, recursive=True) + logger.debug(f"Deleted object: {path}") + return True + except Exception as e: + logger.warning(f"Error deleting object {path}: {e}") + + return False + + +def scan( + *schemas: "Schema", + store_name: str | None = None, + verbose: bool = False, +) -> dict[str, Any]: + """ + Scan for orphaned content and objects without deleting. + + Scans both content-addressed storage (for , , ) + and path-addressed storage (for ). + + Args: + *schemas: Schema instances to scan + store_name: Store to check (None = default store) + verbose: Print progress information + + Returns: + Dict with scan statistics: + - content_referenced: Number of content items referenced in database + - content_stored: Number of content items in storage + - content_orphaned: Number of unreferenced content items + - content_orphaned_bytes: Total size of orphaned content + - orphaned_hashes: List of orphaned content hashes + - object_referenced: Number of objects referenced in database + - object_stored: Number of objects in storage + - object_orphaned: Number of unreferenced objects + - object_orphaned_bytes: Total size of orphaned objects + - orphaned_paths: List of orphaned object paths + """ + if not schemas: + raise DataJointError("At least one schema must be provided") + + # --- Content-addressed storage --- + content_referenced = scan_references(*schemas, store_name=store_name, verbose=verbose) + content_stored = list_stored_content(store_name) + orphaned_hashes = set(content_stored.keys()) - content_referenced + content_orphaned_bytes = sum(content_stored.get(h, 0) for h in orphaned_hashes) + + # --- Path-addressed storage (objects) --- + 
object_referenced = scan_object_references(*schemas, store_name=store_name, verbose=verbose) + object_stored = list_stored_objects(store_name) + orphaned_paths = set(object_stored.keys()) - object_referenced + object_orphaned_bytes = sum(object_stored.get(p, 0) for p in orphaned_paths) + + return { + # Content-addressed storage stats + "content_referenced": len(content_referenced), + "content_stored": len(content_stored), + "content_orphaned": len(orphaned_hashes), + "content_orphaned_bytes": content_orphaned_bytes, + "orphaned_hashes": sorted(orphaned_hashes), + # Path-addressed storage stats + "object_referenced": len(object_referenced), + "object_stored": len(object_stored), + "object_orphaned": len(orphaned_paths), + "object_orphaned_bytes": object_orphaned_bytes, + "orphaned_paths": sorted(orphaned_paths), + # Combined totals + "referenced": len(content_referenced) + len(object_referenced), + "stored": len(content_stored) + len(object_stored), + "orphaned": len(orphaned_hashes) + len(orphaned_paths), + "orphaned_bytes": content_orphaned_bytes + object_orphaned_bytes, + } + + +def collect( + *schemas: "Schema", + store_name: str | None = None, + dry_run: bool = True, + verbose: bool = False, +) -> dict[str, Any]: + """ + Remove orphaned content and objects from storage. + + Scans the given schemas for content and object references, then removes any + storage items that are not referenced. 
+ + Args: + *schemas: Schema instances to scan + store_name: Store to clean (None = default store) + dry_run: If True, report what would be deleted without deleting + verbose: Print progress information + + Returns: + Dict with collection statistics: + - referenced: Total items referenced in database + - stored: Total items in storage + - orphaned: Total unreferenced items + - content_deleted: Number of content items deleted + - object_deleted: Number of object items deleted + - deleted: Total items deleted (0 if dry_run) + - bytes_freed: Bytes freed (0 if dry_run) + - errors: Number of deletion errors + """ + # First scan to find orphaned content and objects + stats = scan(*schemas, store_name=store_name, verbose=verbose) + + content_deleted = 0 + object_deleted = 0 + bytes_freed = 0 + errors = 0 + + if not dry_run: + # Delete orphaned content (hash-addressed) + if stats["content_orphaned"] > 0: + content_stored = list_stored_content(store_name) + + for content_hash in stats["orphaned_hashes"]: + try: + size = content_stored.get(content_hash, 0) + if delete_content(content_hash, store_name): + content_deleted += 1 + bytes_freed += size + if verbose: + logger.info(f"Deleted content: {content_hash[:16]}... 
({size} bytes)") + except Exception as e: + errors += 1 + logger.warning(f"Failed to delete content {content_hash[:16]}...: {e}") + + # Delete orphaned objects (path-addressed) + if stats["object_orphaned"] > 0: + object_stored = list_stored_objects(store_name) + + for path in stats["orphaned_paths"]: + try: + size = object_stored.get(path, 0) + if delete_object(path, store_name): + object_deleted += 1 + bytes_freed += size + if verbose: + logger.info(f"Deleted object: {path} ({size} bytes)") + except Exception as e: + errors += 1 + logger.warning(f"Failed to delete object {path}: {e}") + + return { + "referenced": stats["referenced"], + "stored": stats["stored"], + "orphaned": stats["orphaned"], + "content_deleted": content_deleted, + "object_deleted": object_deleted, + "deleted": content_deleted + object_deleted, + "bytes_freed": bytes_freed, + "errors": errors, + "dry_run": dry_run, + # Include detailed stats + "content_orphaned": stats["content_orphaned"], + "object_orphaned": stats["object_orphaned"], + } + + +def format_stats(stats: dict[str, Any]) -> str: + """ + Format GC statistics as a human-readable string. 
+
+    Args:
+        stats: Statistics dict from scan() or collect()
+
+    Returns:
+        Formatted string
+    """
+    lines = ["External Storage Statistics:"]
+
+    # Show content-addressed storage stats if present
+    if "content_referenced" in stats:
+        lines.append("")
+        lines.append("Content-Addressed Storage (<content>, <xblob>, <xattach>):")
+        lines.append(f"  Referenced: {stats['content_referenced']}")
+        lines.append(f"  Stored: {stats['content_stored']}")
+        lines.append(f"  Orphaned: {stats['content_orphaned']}")
+        if "content_orphaned_bytes" in stats:
+            size_mb = stats["content_orphaned_bytes"] / (1024 * 1024)
+            lines.append(f"  Orphaned size: {size_mb:.2f} MB")
+
+    # Show path-addressed storage stats if present
+    if "object_referenced" in stats:
+        lines.append("")
+        lines.append("Path-Addressed Storage (<object>):")
+        lines.append(f"  Referenced: {stats['object_referenced']}")
+        lines.append(f"  Stored: {stats['object_stored']}")
+        lines.append(f"  Orphaned: {stats['object_orphaned']}")
+        if "object_orphaned_bytes" in stats:
+            size_mb = stats["object_orphaned_bytes"] / (1024 * 1024)
+            lines.append(f"  Orphaned size: {size_mb:.2f} MB")
+
+    # Show totals
+    lines.append("")
+    lines.append("Totals:")
+    lines.append(f"  Referenced in database: {stats['referenced']}")
+    lines.append(f"  Stored in backend: {stats['stored']}")
+    lines.append(f"  Orphaned (unreferenced): {stats['orphaned']}")
+
+    if "orphaned_bytes" in stats:
+        size_mb = stats["orphaned_bytes"] / (1024 * 1024)
+        lines.append(f"  Orphaned size: {size_mb:.2f} MB")
+
+    # Show deletion results if this is from collect()
+    if "deleted" in stats:
+        lines.append("")
+        if stats.get("dry_run", True):
+            lines.append("  [DRY RUN - no changes made]")
+        else:
+            lines.append(f"  Deleted: {stats['deleted']}")
+            if "content_deleted" in stats:
+                lines.append(f"    Content: {stats['content_deleted']}")
+            if "object_deleted" in stats:
+                lines.append(f"    Objects: {stats['object_deleted']}")
+            freed_mb = stats["bytes_freed"] / (1024 * 1024)
+            lines.append(f"  Bytes freed: {freed_mb:.2f} MB")
+ if stats.get("errors", 0) > 0: + lines.append(f" Errors: {stats['errors']}") + + return "\n".join(lines) diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index f4bd57a79..3221522fd 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -5,15 +5,45 @@ import numpy as np -from .attribute_adapter import AttributeAdapter, get_adapter +from .attribute_type import get_adapter +from .attribute_type import AttributeType from .declare import ( - EXTERNAL_TYPES, - NATIVE_TYPES, + CORE_TYPE_NAMES, SPECIAL_TYPES, TYPE_PATTERN, - UUID_DATA_TYPE, ) -from .errors import FILEPATH_FEATURE_SWITCH, DataJointError, _support_filepath_types +from .errors import DataJointError + + +class _MissingType(AttributeType): + """Placeholder for missing/unregistered attribute types. Raises error on use.""" + + def __init__(self, name: str): + self._name = name + + @property + def type_name(self) -> str: + return self._name + + @property + def dtype(self) -> str: + raise DataJointError( + f"Attribute type <{self._name}> is not registered. " + "Register it with @dj.register_type or include it in the schema context." + ) + + def encode(self, value, *, key=None): + raise DataJointError( + f"Attribute type <{self._name}> is not registered. " + "Register it with @dj.register_type or include it in the schema context." + ) + + def decode(self, stored, *, key=None): + raise DataJointError( + f"Attribute type <{self._name}> is not registered. " + "Register it with @dj.register_type or include it in the schema context." + ) + logger = logging.getLogger(__name__.split(".")[0]) @@ -30,16 +60,11 @@ uuid=False, json=None, is_blob=False, - is_attachment=False, - is_filepath=False, - is_object=False, - is_external=False, is_hidden=False, adapter=None, store=None, unsupported=False, attribute_expression=None, - database=None, dtype=object, ) @@ -56,11 +81,13 @@ def todict(self): @property def sql_type(self): """:return: datatype (as string) in database. 
In most cases, it is the same as self.type""" - return UUID_DATA_TYPE if self.uuid else self.type + # UUID is now a core type alias - already resolved to binary(16) + return self.type @property def sql_comment(self): """:return: full comment for the SQL declaration. Includes custom type specification""" + # UUID info is stored in the comment for reconstruction return (":uuid:" if self.uuid else "") + self.comment @property @@ -135,17 +162,10 @@ def secondary_attributes(self): def blobs(self): return [k for k, v in self.attributes.items() if v.is_blob] - @property - def objects(self): - return [k for k, v in self.attributes.items() if v.is_object] - @property def non_blobs(self): - return [ - k - for k, v in self.attributes.items() - if not (v.is_blob or v.is_attachment or v.is_filepath or v.is_object or v.json) - ] + """Attributes that are not blobs or JSON (used for simple column handling).""" + return [k for k, v in self.attributes.items() if not (v.is_blob or v.json)] @property def new_attributes(self): @@ -261,20 +281,15 @@ def _init_from_database(self): for attr in attributes: attr.update( in_key=(attr["in_key"] == "PRI"), - database=database, nullable=attr["nullable"] == "YES", autoincrement=bool(re.search(r"auto_increment", attr["Extra"], flags=re.I)), numeric=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("DECIMAL", "INTEGER", "FLOAT")), string=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("ENUM", "TEMPORAL", "STRING")), - is_blob=bool(TYPE_PATTERN["INTERNAL_BLOB"].match(attr["type"])), + is_blob=bool(TYPE_PATTERN["BLOB"].match(attr["type"])), uuid=False, json=bool(TYPE_PATTERN["JSON"].match(attr["type"])), - is_attachment=False, - is_filepath=False, - is_object=False, adapter=None, store=None, - is_external=False, attribute_expression=None, is_hidden=attr["name"].startswith("_"), ) @@ -284,30 +299,34 @@ def _init_from_database(self): attr["unsupported"] = not any((attr["is_blob"], attr["numeric"], attr["numeric"])) attr.pop("Extra") - # process 
custom DataJoint types + # process custom DataJoint types stored in comment special = re.match(r":(?P[^:]+):(?P.*)", attr["comment"]) if special: special = special.groupdict() attr.update(special) - # process adapted attribute types + + # process AttributeTypes (adapted types in angle brackets) if special and TYPE_PATTERN["ADAPTED"].match(attr["type"]): assert context is not None, "Declaration context is not set" adapter_name = special["type"] try: - attr.update(adapter=get_adapter(context, adapter_name)) + adapter_result = get_adapter(context, adapter_name) + # get_adapter returns (adapter, store_name) tuple + if isinstance(adapter_result, tuple): + attr["adapter"], attr["store"] = adapter_result + else: + attr["adapter"] = adapter_result except DataJointError: # if no adapter, then delay the error until the first invocation - attr.update(adapter=AttributeAdapter()) + attr["adapter"] = _MissingType(adapter_name) else: - attr.update(type=attr["adapter"].attribute_type) + attr["type"] = attr["adapter"].dtype if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()): - raise DataJointError( - "Invalid attribute type '{type}' in adapter object <{adapter_name}>.".format( - adapter_name=adapter_name, **attr - ) - ) - special = not any(TYPE_PATTERN[c].match(attr["type"]) for c in NATIVE_TYPES) + raise DataJointError(f"Invalid dtype '{attr['type']}' in attribute type <{adapter_name}>.") + # Update is_blob based on resolved dtype + attr["is_blob"] = bool(TYPE_PATTERN["BLOB"].match(attr["type"])) + # Handle core type aliases (uuid, float32, etc.) if special: try: category = next(c for c in SPECIAL_TYPES if TYPE_PATTERN[c].match(attr["type"])) @@ -322,46 +341,18 @@ def _init_from_database(self): url=url, **attr ) ) - raise DataJointError("Unknown attribute type `{type}`".format(**attr)) - if category == "FILEPATH" and not _support_filepath_types(): - raise DataJointError( - """ - The filepath data type is disabled until complete validation. 
- To turn it on as experimental feature, set the environment variable - {env} = TRUE or upgrade datajoint. - """.format(env=FILEPATH_FEATURE_SWITCH) - ) - # Extract store name for external types and object types with named stores - store = None - if category in EXTERNAL_TYPES: - store = attr["type"].split("@")[1] - elif category == "OBJECT" and "@" in attr["type"]: - store = attr["type"].split("@")[1] - - attr.update( - unsupported=False, - is_attachment=category in ("INTERNAL_ATTACH", "EXTERNAL_ATTACH"), - is_filepath=category == "FILEPATH", - is_object=category == "OBJECT", - # INTERNAL_BLOB is not a custom type but is included for completeness - is_blob=category in ("INTERNAL_BLOB", "EXTERNAL_BLOB"), - uuid=category == "UUID", - is_external=category in EXTERNAL_TYPES, - store=store, - ) + # Not a special type - that's fine, could be native passthrough + category = None - if attr["in_key"] and any( - ( - attr["is_blob"], - attr["is_attachment"], - attr["is_filepath"], - attr["is_object"], - attr["json"], - ) - ): - raise DataJointError( - "Json, Blob, attachment, filepath, or object attributes " "are not allowed in the primary key" - ) + if category == "UUID": + attr["uuid"] = True + elif category in CORE_TYPE_NAMES: + # Core type alias - already resolved in DB + pass + + # Check primary key constraints + if attr["in_key"] and (attr["is_blob"] or attr["json"]): + raise DataJointError("Blob or JSON attributes are not allowed in the primary key") if attr["string"] and attr["default"] is not None and attr["default"] not in sql_literals: attr["default"] = '"%s"' % attr["default"] @@ -382,7 +373,7 @@ def _init_from_database(self): attr["dtype"] = numeric_types[(t, is_unsigned)] if attr["adapter"]: - # restore adapted type name + # restore adapted type name for display attr["type"] = adapter_name self._attributes = dict(((q["name"], Attribute(**q)) for q in attributes)) diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py new file mode 100644 index 
000000000..696ca380e --- /dev/null +++ b/src/datajoint/migrate.py @@ -0,0 +1,250 @@ +""" +Migration utilities for DataJoint schema updates. + +This module provides tools for migrating existing schemas to use the new +AttributeType system, particularly for upgrading blob columns to use +explicit `` type declarations. +""" + +from __future__ import annotations + +import logging +import re +from typing import TYPE_CHECKING + +from .errors import DataJointError + +if TYPE_CHECKING: + from .schemas import Schema + +logger = logging.getLogger(__name__.split(".")[0]) + +# Pattern to detect blob types +BLOB_TYPES = re.compile(r"^(tiny|small|medium|long|)blob$", re.I) + + +def analyze_blob_columns(schema: Schema) -> list[dict]: + """ + Analyze a schema to find blob columns that could be migrated to . + + This function identifies blob columns that: + 1. Have a MySQL blob type (tinyblob, blob, mediumblob, longblob) + 2. Do NOT already have an adapter/type specified in their comment + + All blob size variants are included in the analysis. + + Args: + schema: The DataJoint schema to analyze. + + Returns: + List of dicts with keys: + - table_name: Full table name (database.table) + - column_name: Name of the blob column + - column_type: MySQL column type (tinyblob, blob, mediumblob, longblob) + - current_comment: Current column comment + - needs_migration: True if column should be migrated + + Example: + >>> import datajoint as dj + >>> schema = dj.schema('my_database') + >>> columns = dj.migrate.analyze_blob_columns(schema) + >>> for col in columns: + ... if col['needs_migration']: + ... 
print(f"{col['table_name']}.{col['column_name']} ({col['column_type']})") + """ + results = [] + + connection = schema.connection + + # Get all tables in the schema + tables_query = """ + SELECT TABLE_NAME + FROM information_schema.TABLES + WHERE TABLE_SCHEMA = %s + AND TABLE_TYPE = 'BASE TABLE' + AND TABLE_NAME NOT LIKE '~%%' + """ + + tables = connection.query(tables_query, args=(schema.database,)).fetchall() + + for (table_name,) in tables: + # Get column information for each table + columns_query = """ + SELECT COLUMN_NAME, COLUMN_TYPE, COLUMN_COMMENT + FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = %s + AND TABLE_NAME = %s + AND DATA_TYPE IN ('tinyblob', 'blob', 'mediumblob', 'longblob') + """ + + columns = connection.query(columns_query, args=(schema.database, table_name)).fetchall() + + for column_name, column_type, comment in columns: + # Check if comment already has an adapter type (starts with :type:) + has_adapter = comment and comment.startswith(":") + + results.append( + { + "table_name": f"{schema.database}.{table_name}", + "column_name": column_name, + "column_type": column_type, + "current_comment": comment or "", + "needs_migration": not has_adapter, + } + ) + + return results + + +def generate_migration_sql( + schema: Schema, + target_type: str = "djblob", + dry_run: bool = True, +) -> list[str]: + """ + Generate SQL statements to migrate blob columns to use . + + This generates ALTER TABLE statements that update column comments to + include the `::` prefix, marking them as using explicit + DataJoint blob serialization. + + Args: + schema: The DataJoint schema to migrate. + target_type: The type name to migrate to (default: "djblob"). + dry_run: If True, only return SQL without executing. + + Returns: + List of SQL ALTER TABLE statements. + + Example: + >>> sql_statements = dj.migrate.generate_migration_sql(schema) + >>> for sql in sql_statements: + ... print(sql) + + Note: + This is a metadata-only migration. 
The actual blob data format + remains unchanged - only the column comments are updated to + indicate explicit type handling. + """ + columns = analyze_blob_columns(schema) + sql_statements = [] + + for col in columns: + if not col["needs_migration"]: + continue + + # Build new comment with type prefix + old_comment = col["current_comment"] + new_comment = f":<{target_type}>:{old_comment}" + + # Escape special characters for SQL + new_comment_escaped = new_comment.replace("\\", "\\\\").replace("'", "\\'") + + # Parse table name + db_name, table_name = col["table_name"].split(".") + + # Generate ALTER TABLE statement + sql = ( + f"ALTER TABLE `{db_name}`.`{table_name}` " + f"MODIFY COLUMN `{col['column_name']}` {col['column_type']} " + f"COMMENT '{new_comment_escaped}'" + ) + sql_statements.append(sql) + + return sql_statements + + +def migrate_blob_columns( + schema: Schema, + target_type: str = "djblob", + dry_run: bool = True, +) -> dict: + """ + Migrate blob columns in a schema to use explicit type. + + This updates column comments in the database to include the type + declaration. The data format remains unchanged. + + Args: + schema: The DataJoint schema to migrate. + target_type: The type name to migrate to (default: "djblob"). + dry_run: If True, only preview changes without applying. 
+ + Returns: + Dict with keys: + - analyzed: Number of blob columns analyzed + - needs_migration: Number of columns that need migration + - migrated: Number of columns migrated (0 if dry_run) + - sql_statements: List of SQL statements (executed or to be executed) + + Example: + >>> # Preview migration + >>> result = dj.migrate.migrate_blob_columns(schema, dry_run=True) + >>> print(f"Would migrate {result['needs_migration']} columns") + + >>> # Apply migration + >>> result = dj.migrate.migrate_blob_columns(schema, dry_run=False) + >>> print(f"Migrated {result['migrated']} columns") + + Warning: + After migration, table definitions should be updated to use + `` instead of `longblob` for consistency. The migration + only updates database metadata; source code changes are manual. + """ + columns = analyze_blob_columns(schema) + sql_statements = generate_migration_sql(schema, target_type=target_type) + + result = { + "analyzed": len(columns), + "needs_migration": sum(1 for c in columns if c["needs_migration"]), + "migrated": 0, + "sql_statements": sql_statements, + } + + if dry_run: + logger.info(f"Dry run: would migrate {result['needs_migration']} columns") + for sql in sql_statements: + logger.info(f" {sql}") + return result + + # Execute migrations + connection = schema.connection + for sql in sql_statements: + try: + connection.query(sql) + result["migrated"] += 1 + logger.info(f"Executed: {sql}") + except Exception as e: + logger.error(f"Failed to execute: {sql}\nError: {e}") + raise DataJointError(f"Migration failed: {e}") from e + + logger.info(f"Successfully migrated {result['migrated']} columns") + return result + + +def check_migration_status(schema: Schema) -> dict: + """ + Check the migration status of blob columns in a schema. + + Args: + schema: The DataJoint schema to check. 
+ + Returns: + Dict with keys: + - total_blob_columns: Total number of blob columns + - migrated: Number of columns with explicit type + - pending: Number of columns using implicit serialization + - columns: List of column details + + Example: + >>> status = dj.migrate.check_migration_status(schema) + >>> print(f"Migration progress: {status['migrated']}/{status['total_blob_columns']}") + """ + columns = analyze_blob_columns(schema) + + return { + "total_blob_columns": len(columns), + "migrated": sum(1 for c in columns if not c["needs_migration"]), + "pending": sum(1 for c in columns if c["needs_migration"]), + "columns": columns, + } diff --git a/src/datajoint/preview.py b/src/datajoint/preview.py index 5c61db1da..7572125e9 100644 --- a/src/datajoint/preview.py +++ b/src/datajoint/preview.py @@ -27,7 +27,8 @@ def _format_object_display(json_data): def preview(query_expression, limit, width): heading = query_expression.heading rel = query_expression.proj(*heading.non_blobs) - object_fields = heading.objects + # Object fields are AttributeTypes with adapters - not specially handled in simplified model + object_fields = [] if limit is None: limit = config["display.limit"] if width is None: @@ -87,7 +88,8 @@ def get_display_value(tup, f, idx): def repr_html(query_expression): heading = query_expression.heading rel = query_expression.proj(*heading.non_blobs) - object_fields = heading.objects + # Object fields are AttributeTypes with adapters - not specially handled in simplified model + object_fields = [] info = heading.table_status tuples = rel.fetch(limit=config["display.limit"] + 1, format="array") has_more = len(tuples) > config["display.limit"] diff --git a/src/datajoint/schemas.py b/src/datajoint/schemas.py index e9b83efff..0b42f0104 100644 --- a/src/datajoint/schemas.py +++ b/src/datajoint/schemas.py @@ -8,7 +8,6 @@ from .connection import conn from .errors import AccessError, DataJointError -from .external import ExternalMapping from .heading import Heading from 
.jobs import JobTable from .settings import config @@ -71,7 +70,6 @@ def __init__( self.create_schema = create_schema self.create_tables = create_tables self._jobs = None - self.external = ExternalMapping(self) self.add_objects = add_objects self.declare_list = [] if schema_name: diff --git a/src/datajoint/staged_insert.py b/src/datajoint/staged_insert.py index 9083bb78b..3a3d5bd17 100644 --- a/src/datajoint/staged_insert.py +++ b/src/datajoint/staged_insert.py @@ -98,8 +98,9 @@ def _get_storage_path(self, field: str, ext: str = "") -> str: raise DataJointError(f"Attribute '{field}' not found in table heading") attr = self._table.heading[field] - if not attr.is_object: - raise DataJointError(f"Attribute '{field}' is not an object type") + # Check if this is an object AttributeType (has adapter with "object" in type_name) + if not (attr.adapter and hasattr(attr.adapter, "type_name") and "object" in attr.adapter.type_name): + raise DataJointError(f"Attribute '{field}' is not an <object> type") # Extract primary key from rec primary_key = {k: self._rec[k] for k in self._table.primary_key if k in self._rec} diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 356c538ed..dce1e70ab 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -14,7 +14,6 @@ import numpy as np import pandas -from . import blob from .condition import make_condition from .declare import alter, declare from .errors import ( @@ -103,12 +102,9 @@ def declare(self, context=None): "Table class name `{name}` is invalid. Please use CamelCase. ".format(name=self.class_name) + "Classes defining tables should be formatted in strict CamelCase."
) - sql, external_stores = declare(self.full_table_name, self.definition, context) + sql, _external_stores = declare(self.full_table_name, self.definition, context) sql = sql.format(database=self.database) try: - # declare all external tables before declaring main table - for store in external_stores: - self.connection.schemas[self.database].external[store] self.connection.query(sql) except AccessError: # skip if no create privilege @@ -127,7 +123,7 @@ def alter(self, prompt=True, context=None): context = dict(frame.f_globals, **frame.f_locals) del frame old_definition = self.describe(context=context) - sql, external_stores = alter(self.definition, old_definition, context) + sql, _external_stores = alter(self.definition, old_definition, context) if not sql: if prompt: logger.warning("Nothing to alter.") @@ -135,9 +131,6 @@ def alter(self, prompt=True, context=None): sql = "ALTER TABLE {tab}\n\t".format(tab=self.full_table_name) + ",\n\t".join(sql) if not prompt or user_choice(sql + "\n\nExecute?") == "yes": try: - # declare all external tables before declaring main table - for store in external_stores: - self.connection.schemas[self.database].external[store] self.connection.query(sql) except AccessError: # skip if no create privilege @@ -352,7 +345,7 @@ def _process_object_value(self, name: str, value, row: dict, store_name: str | N size = source_path.stat().st_size else: raise DataJointError( - f"Invalid value type for object attribute {name}. " "Expected file path, folder path, or (ext, stream) tuple." + f"Invalid value type for object attribute {name}. Expected file path, folder path, or (ext, stream) tuple." ) # Get storage spec for path building @@ -925,52 +918,62 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): as a string to be included in the query and the value, if any, to be submitted for processing by mysql API. 
+ In the simplified type system: + - Adapters (AttributeTypes) handle all custom encoding via type chains + - UUID values are converted to bytes + - JSON values are serialized + - Blob values pass through as bytes + - Numeric values are stringified + :param name: name of attribute to be inserted :param value: value of attribute to be inserted :param ignore_extra_fields: if True, return None for unknown fields - :param row: the full row dict (needed for object attributes to extract primary key) + :param row: the full row dict (unused in simplified model) """ if ignore_extra_fields and name not in self.heading: return None attr = self.heading[name] + + # Apply adapter encoding with type chain support if attr.adapter: - value = attr.adapter.put(value) + from .attribute_type import resolve_dtype + + attr.adapter.validate(value) + + # Resolve full type chain + _, type_chain, resolved_store = resolve_dtype(f"<{attr.adapter.type_name}>", store_name=attr.store) + + # Apply encoders from outermost to innermost + for attr_type in type_chain: + # Pass store_name to encoders that support it + try: + value = attr_type.encode(value, key=None, store_name=resolved_store) + except TypeError: + # Encoder doesn't accept store_name parameter + value = attr_type.encode(value, key=None) + + # Handle NULL values if value is None or (attr.numeric and (value == "" or np.isnan(float(value)))): - # set default value placeholder, value = "DEFAULT", None - else: # not NULL + else: placeholder = "%s" + # UUID - convert to bytes if attr.uuid: if not isinstance(value, uuid.UUID): try: value = uuid.UUID(value) except (AttributeError, ValueError): - raise DataJointError("badly formed UUID value {v} for attribute `{n}`".format(v=value, n=name)) + raise DataJointError(f"badly formed UUID value {value} for attribute `{name}`") value = value.bytes - elif attr.is_blob: - value = blob.pack(value) - value = self.external[attr.store].put(value).bytes if attr.is_external else value - elif 
attr.is_attachment: - attachment_path = Path(value) - if attr.is_external: - # value is hash of contents - value = self.external[attr.store].upload_attachment(attachment_path).bytes - else: - # value is filename + contents - value = str.encode(attachment_path.name) + b"\0" + attachment_path.read_bytes() - elif attr.is_filepath: - value = self.external[attr.store].upload_filepath(value).bytes - elif attr.is_object: - # Object type - upload to object storage and return JSON metadata - if row is None: - raise DataJointError( - f"Object attribute {name} requires full row context for insert. " "This is an internal error." - ) - value = self._process_object_value(name, value, row, store_name=attr.store) - elif attr.numeric: - value = str(int(value) if isinstance(value, bool) else value) + # JSON - serialize to string elif attr.json: value = json.dumps(value) + # Numeric - convert to string + elif attr.numeric: + value = str(int(value) if isinstance(value, bool) else value) + # Blob - pass through as bytes (adapters handle serialization) + # elif attr.is_blob: pass through unchanged + return name, placeholder, value def __make_row_to_insert(self, row, field_list, ignore_extra_fields): diff --git a/src/datajoint/user_tables.py b/src/datajoint/user_tables.py index d7faeb285..fa26bc9c6 100644 --- a/src/datajoint/user_tables.py +++ b/src/datajoint/user_tables.py @@ -7,7 +7,7 @@ from .autopopulate import AutoPopulate from .errors import DataJointError from .table import Table -from .utils import ClassProperty, from_camel_case +from .utils import from_camel_case _base_regexp = r"[a-z][a-z0-9]*(_[a-z][a-z0-9]*)*" @@ -78,6 +78,26 @@ def __add__(cls, arg): def __iter__(cls): return iter(cls()) + # Class properties - defined on metaclass to work at class level + @property + def connection(cls): + """The database connection for this table.""" + return cls._connection + + @property + def table_name(cls): + """The table name formatted for MySQL.""" + if cls._prefix is None: + raise 
AttributeError("Class prefix is not defined!") + return cls._prefix + from_camel_case(cls.__name__) + + @property + def full_table_name(cls): + """The fully qualified table name (`database`.`table`).""" + if cls.database is None: + return None + return r"`{0:s}`.`{1:s}`".format(cls.database, cls.table_name) + class UserTable(Table, metaclass=TableMeta): """ @@ -101,27 +121,6 @@ def definition(self): """ raise NotImplementedError('Subclasses of Table must implement the property "definition"') - @ClassProperty - def connection(cls): - return cls._connection - - @ClassProperty - def table_name(cls): - """ - :return: the table name of the table formatted for mysql. - """ - if cls._prefix is None: - raise AttributeError("Class prefix is not defined!") - return cls._prefix + from_camel_case(cls.__name__) - - @ClassProperty - def full_table_name(cls): - if cls not in {Manual, Imported, Lookup, Computed, Part, UserTable}: - # for derived classes only - if cls.database is None: - raise DataJointError("Class %s is not properly declared (schema decorator not applied?)" % cls.__name__) - return r"`{0:s}`.`{1:s}`".format(cls.database, cls.table_name) - class Manual(UserTable): """ @@ -163,7 +162,28 @@ class Computed(UserTable, AutoPopulate): tier_regexp = r"(?P" + _prefix + _base_regexp + ")" -class Part(UserTable): +class PartMeta(TableMeta): + """Metaclass for Part tables with overridden class properties.""" + + @property + def table_name(cls): + """The table name for a Part is derived from its master table.""" + return None if cls.master is None else cls.master.table_name + "__" + from_camel_case(cls.__name__) + + @property + def full_table_name(cls): + """The fully qualified table name (`database`.`table`).""" + if cls.database is None or cls.table_name is None: + return None + return r"`{0:s}`.`{1:s}`".format(cls.database, cls.table_name) + + @property + def master(cls): + """The master table for this Part table.""" + return cls._master + + +class Part(UserTable, 
metaclass=PartMeta): """ Inherit from this class if the table's values are details of an entry in another table and if this table is populated by the other table. For example, the entries inheriting from @@ -184,24 +204,6 @@ class Part(UserTable): + ")" ) - @ClassProperty - def connection(cls): - return cls._connection - - @ClassProperty - def full_table_name(cls): - return ( - None if cls.database is None or cls.table_name is None else r"`{0:s}`.`{1:s}`".format(cls.database, cls.table_name) - ) - - @ClassProperty - def master(cls): - return cls._master - - @ClassProperty - def table_name(cls): - return None if cls.master is None else cls.master.table_name + "__" + from_camel_case(cls.__name__) - def delete(self, force=False): """ unless force is True, prohibits direct deletes from parts. diff --git a/src/datajoint/utils.py b/src/datajoint/utils.py index 16927965e..e8303a993 100644 --- a/src/datajoint/utils.py +++ b/src/datajoint/utils.py @@ -7,14 +7,6 @@ from .errors import DataJointError -class ClassProperty: - def __init__(self, f): - self.f = f - - def __get__(self, obj, owner): - return self.f(owner) - - def user_choice(prompt, choices=("yes", "no"), default=None): """ Prompts the user for confirmation. The default value, if any, is capitalized. diff --git a/tests/conftest.py b/tests/conftest.py index c2f2a5ae9..d90bfc867 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,7 +16,6 @@ import datajoint as dj from datajoint.errors import ( - ADAPTED_TYPE_SWITCH, FILEPATH_FEATURE_SWITCH, DataJointError, ) @@ -334,10 +333,14 @@ def monkeymodule(): @pytest.fixture -def enable_adapted_types(monkeypatch): - monkeypatch.setenv(ADAPTED_TYPE_SWITCH, "TRUE") +def enable_adapted_types(): + """ + Deprecated fixture - custom attribute types no longer require a feature flag. + + This fixture is kept for backward compatibility but does nothing. + Custom types are now enabled by default via the AttributeType system. 
+ """ yield - monkeypatch.delenv(ADAPTED_TYPE_SWITCH, raising=True) @pytest.fixture diff --git a/tests/schema_adapted.py b/tests/schema_adapted.py index c7b5830c0..321edfc7b 100644 --- a/tests/schema_adapted.py +++ b/tests/schema_adapted.py @@ -7,40 +7,42 @@ import datajoint as dj -class GraphAdapter(dj.AttributeAdapter): - attribute_type = "longblob" # this is how the attribute will be declared +@dj.register_type +class GraphType(dj.AttributeType): + """Custom type for storing NetworkX graphs as edge lists.""" - @staticmethod - def get(obj): - # convert edge list into a graph - return nx.Graph(obj) + type_name = "graph" + dtype = "longblob" - @staticmethod - def put(obj): - # convert graph object into an edge list + def encode(self, obj, *, key=None): + """Convert graph object into an edge list.""" assert isinstance(obj, nx.Graph) return list(obj.edges) + def decode(self, stored, *, key=None): + """Convert edge list into a graph.""" + return nx.Graph(stored) -class LayoutToFilepath(dj.AttributeAdapter): - """ - An adapted data type that saves a graph layout into fixed filepath - """ - attribute_type = "filepath@repo-s3" +@dj.register_type +class LayoutToFilepathType(dj.AttributeType): + """Custom type that saves a graph layout to a filepath.""" - @staticmethod - def get(path): - with open(path, "r") as f: - return json.load(f) + type_name = "layout_to_filepath" + dtype = "filepath@repo-s3" - @staticmethod - def put(layout): + def encode(self, layout, *, key=None): + """Save layout to file and return path.""" path = Path(dj.config["stores"]["repo-s3"]["stage"], "layout.json") with open(str(path), "w") as f: json.dump(layout, f) return path + def decode(self, path, *, key=None): + """Load layout from file.""" + with open(path, "r") as f: + return json.load(f) + class Connectivity(dj.Manual): definition = """ diff --git a/tests/test_adapted_attributes.py b/tests/test_adapted_attributes.py index 1060a50ed..eb5cd760d 100644 --- a/tests/test_adapted_attributes.py +++ 
b/tests/test_adapted_attributes.py @@ -1,3 +1,9 @@ +""" +Tests for adapted/custom attribute types. + +These tests verify the AttributeType system for custom data types. +""" + from itertools import zip_longest import networkx as nx @@ -14,27 +20,17 @@ def schema_name(prefix): return prefix + "_test_custom_datatype" -@pytest.fixture -def adapted_graph_instance(): - yield schema_adapted.GraphAdapter() - - @pytest.fixture def schema_ad( connection_test, - adapted_graph_instance, - enable_adapted_types, enable_filepath_feature, s3_creds, tmpdir, schema_name, ): dj.config["stores"] = {"repo-s3": dict(s3_creds, protocol="s3", location="adapted/repo", stage=str(tmpdir))} - context = { - **schema_adapted.LOCALS_ADAPTED, - "graph": adapted_graph_instance, - "layout_to_filepath": schema_adapted.LayoutToFilepath(), - } + # Types are registered globally via @dj.register_type decorator in schema_adapted + context = {**schema_adapted.LOCALS_ADAPTED} schema = dj.schema(schema_name, context=context, connection=connection_test) schema(schema_adapted.Connectivity) schema(schema_adapted.Layout) @@ -52,9 +48,10 @@ def local_schema(schema_ad, schema_name): @pytest.fixture -def schema_virtual_module(schema_ad, adapted_graph_instance, schema_name): +def schema_virtual_module(schema_ad, schema_name): """Fixture for testing virtual modules""" - schema_virtual_module = dj.VirtualModule("virtual_module", schema_name, add_objects={"graph": adapted_graph_instance}) + # Types are registered globally, no need to add_objects for adapters + schema_virtual_module = dj.VirtualModule("virtual_module", schema_name) return schema_virtual_module @@ -92,7 +89,7 @@ def test_adapted_filepath_type(schema_ad, minio_client): c.delete() -def test_adapted_spawned(local_schema, enable_adapted_types): +def test_adapted_spawned(local_schema): c = Connectivity() # a spawned class graphs = [ nx.lollipop_graph(4, 2), diff --git a/tests/test_attribute_type.py b/tests/test_attribute_type.py new file mode 100644 index 
000000000..e9220bfd4 --- /dev/null +++ b/tests/test_attribute_type.py @@ -0,0 +1,412 @@ +""" +Tests for the new AttributeType system. +""" + +import pytest + +import datajoint as dj +from datajoint.attribute_type import ( + AttributeType, + _type_registry, + get_type, + is_type_registered, + list_types, + register_type, + resolve_dtype, + unregister_type, +) +from datajoint.errors import DataJointError + + +class TestAttributeTypeRegistry: + """Tests for the type registry functionality.""" + + def setup_method(self): + """Clear any test types from registry before each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + """Clean up test types after each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_register_type_decorator(self): + """Test registering a type using the decorator.""" + + @register_type + class TestType(AttributeType): + type_name = "test_decorator" + dtype = "longblob" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + assert is_type_registered("test_decorator") + assert get_type("test_decorator").type_name == "test_decorator" + + def test_register_type_direct(self): + """Test registering a type by calling register_type directly.""" + + class TestType(AttributeType): + type_name = "test_direct" + dtype = "varchar(255)" + + def encode(self, value, *, key=None): + return str(value) + + def decode(self, stored, *, key=None): + return stored + + register_type(TestType) + assert is_type_registered("test_direct") + + def test_register_type_idempotent(self): + """Test that registering the same type twice is idempotent.""" + + @register_type + class TestType(AttributeType): + type_name = "test_idempotent" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return 
stored + + # Second registration should not raise + register_type(TestType) + assert is_type_registered("test_idempotent") + + def test_register_duplicate_name_different_class(self): + """Test that registering different classes with same name raises error.""" + + @register_type + class TestType1(AttributeType): + type_name = "test_duplicate" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + class TestType2(AttributeType): + type_name = "test_duplicate" + dtype = "varchar(100)" + + def encode(self, value, *, key=None): + return str(value) + + def decode(self, stored, *, key=None): + return stored + + with pytest.raises(DataJointError, match="already registered"): + register_type(TestType2) + + def test_unregister_type(self): + """Test unregistering a type.""" + + @register_type + class TestType(AttributeType): + type_name = "test_unregister" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + assert is_type_registered("test_unregister") + unregister_type("test_unregister") + assert not is_type_registered("test_unregister") + + def test_get_type_not_found(self): + """Test that getting an unregistered type raises error.""" + with pytest.raises(DataJointError, match="Unknown attribute type"): + get_type("nonexistent_type") + + def test_list_types(self): + """Test listing registered types.""" + + @register_type + class TestType(AttributeType): + type_name = "test_list" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + types = list_types() + assert "test_list" in types + assert types == sorted(types) # Should be sorted + + def test_get_type_strips_brackets(self): + """Test that get_type accepts names with or without angle brackets.""" + + @register_type + class TestType(AttributeType): + type_name = "test_brackets" + 
dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + assert get_type("test_brackets") is get_type("<test_brackets>") + + +class TestAttributeTypeValidation: + """Tests for the validate method.""" + + def setup_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_validate_called_default(self): + """Test that default validate accepts any value.""" + + @register_type + class TestType(AttributeType): + type_name = "test_validate_default" + dtype = "longblob" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + t = get_type("test_validate_default") + # Default validate should not raise for any value + t.validate(None) + t.validate(42) + t.validate("string") + t.validate([1, 2, 3]) + + def test_validate_custom(self): + """Test custom validation logic.""" + + @register_type + class PositiveIntType(AttributeType): + type_name = "test_positive_int" + dtype = "int" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + def validate(self, value): + if not isinstance(value, int): + raise TypeError(f"Expected int, got {type(value).__name__}") + if value < 0: + raise ValueError("Value must be positive") + + t = get_type("test_positive_int") + t.validate(42) # Should pass + + with pytest.raises(TypeError): + t.validate("not an int") + + with pytest.raises(ValueError): + t.validate(-1) + + +class TestTypeChaining: + """Tests for type chaining (dtype referencing another custom type).""" + + def setup_method(self): + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + for name in
list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_resolve_native_dtype(self): + """Test resolving a native dtype.""" + final_dtype, chain = resolve_dtype("longblob") + assert final_dtype == "longblob" + assert chain == [] + + def test_resolve_custom_dtype(self): + """Test resolving a custom dtype.""" + + @register_type + class TestType(AttributeType): + type_name = "test_resolve" + dtype = "varchar(100)" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain = resolve_dtype("<test_resolve>") + assert final_dtype == "varchar(100)" + assert len(chain) == 1 + assert chain[0].type_name == "test_resolve" + + def test_resolve_chained_dtype(self): + """Test resolving a chained dtype.""" + + @register_type + class InnerType(AttributeType): + type_name = "test_inner" + dtype = "longblob" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class OuterType(AttributeType): + type_name = "test_outer" + dtype = "<test_inner>" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain = resolve_dtype("<test_outer>") + assert final_dtype == "longblob" + assert len(chain) == 2 + assert chain[0].type_name == "test_outer" + assert chain[1].type_name == "test_inner" + + def test_circular_reference_detection(self): + """Test that circular type references are detected.""" + + @register_type + class TypeA(AttributeType): + type_name = "test_circular_a" + dtype = "<test_circular_b>" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TypeB(AttributeType): + type_name = "test_circular_b" + dtype = "<test_circular_a>" + + def encode(self, value, *, key=None): + return value + + def decode(self, stored, *, key=None): + return stored + + with pytest.raises(DataJointError,
match="Circular type reference"): + resolve_dtype("<test_circular_a>") + + +class TestExportsAndAPI: + """Test that the public API is properly exported.""" + + def test_exports_from_datajoint(self): + """Test that AttributeType and helpers are exported from datajoint.""" + assert hasattr(dj, "AttributeType") + assert hasattr(dj, "register_type") + assert hasattr(dj, "list_types") + + +class TestDJBlobType: + """Tests for the built-in DJBlobType.""" + + def test_djblob_is_registered(self): + """Test that djblob is automatically registered.""" + assert is_type_registered("djblob") + + def test_djblob_properties(self): + """Test DJBlobType properties.""" + blob_type = get_type("djblob") + assert blob_type.type_name == "djblob" + assert blob_type.dtype == "longblob" + + def test_djblob_encode_decode_roundtrip(self): + """Test that encode/decode is a proper roundtrip.""" + import numpy as np + + blob_type = get_type("djblob") + + # Test with various data types + test_data = [ + {"key": "value", "number": 42}, + [1, 2, 3, 4, 5], + np.array([1.0, 2.0, 3.0]), + "simple string", + (1, 2, 3), + None, + ] + + for original in test_data: + encoded = blob_type.encode(original) + assert isinstance(encoded, bytes) + decoded = blob_type.decode(encoded) + if isinstance(original, np.ndarray): + np.testing.assert_array_equal(decoded, original) + else: + assert decoded == original + + def test_djblob_encode_produces_valid_blob_format(self): + """Test that encoded data has valid blob protocol header.""" + blob_type = get_type("djblob") + encoded = blob_type.encode({"test": "data"}) + + # Should start with compression prefix or protocol header + valid_prefixes = (b"ZL123\0", b"mYm\0", b"dj0\0") + assert any(encoded.startswith(p) for p in valid_prefixes) + + def test_djblob_in_list_types(self): + """Test that djblob appears in list_types.""" + types = list_types() + assert "djblob" in types + + def test_djblob_handles_serialization(self): + """Test that DJBlobType handles serialization internally.
+ + With the new design: + - Plain longblob columns store/return raw bytes (no serialization) + - handles pack/unpack in encode/decode + """ + blob_type = get_type("djblob") + + # DJBlobType.encode() should produce packed bytes + data = {"key": "value"} + encoded = blob_type.encode(data) + assert isinstance(encoded, bytes) + + # DJBlobType.decode() should unpack back to original + decoded = blob_type.decode(encoded) + assert decoded == data diff --git a/tests/test_bypass_serialization.py b/tests/test_bypass_serialization.py deleted file mode 100644 index da7f0b0e3..000000000 --- a/tests/test_bypass_serialization.py +++ /dev/null @@ -1,57 +0,0 @@ -import numpy as np -import pytest -from numpy.testing import assert_array_equal - -import datajoint as dj - -test_blob = np.array([1, 2, 3]) - - -class Input(dj.Lookup): - definition = """ - id: int - --- - data: blob - """ - contents = [(0, test_blob)] - - -class Output(dj.Manual): - definition = """ - id: int - --- - data: blob - """ - - -@pytest.fixture -def schema_in(connection_test, prefix): - schema = dj.Schema( - prefix + "_test_bypass_serialization_in", - context=dict(Input=Input), - connection=connection_test, - ) - schema(Input) - yield schema - schema.drop() - - -@pytest.fixture -def schema_out(connection_test, prefix): - schema = dj.Schema( - prefix + "_test_blob_bypass_serialization_out", - context=dict(Output=Output), - connection=connection_test, - ) - schema(Output) - yield schema - schema.drop() - - -def test_bypass_serialization(schema_in, schema_out): - dj.blob.bypass_serialization = True - contents = Input.fetch(as_dict=True) - assert isinstance(contents[0]["data"], bytes) - Output.insert(contents) - dj.blob.bypass_serialization = False - assert_array_equal(Input.fetch1("data"), Output.fetch1("data")) diff --git a/tests/test_content_storage.py b/tests/test_content_storage.py new file mode 100644 index 000000000..e6d0f14cc --- /dev/null +++ b/tests/test_content_storage.py @@ -0,0 +1,231 @@ +""" +Tests 
for content-addressed storage (content_registry.py). +""" + +import hashlib +from unittest.mock import MagicMock, patch + +import pytest + +from datajoint.content_registry import ( + build_content_path, + compute_content_hash, + content_exists, + delete_content, + get_content, + get_content_size, + put_content, +) +from datajoint.errors import DataJointError + + +class TestComputeContentHash: + """Tests for compute_content_hash function.""" + + def test_computes_sha256(self): + """Test that SHA256 hash is computed correctly.""" + data = b"Hello, World!" + result = compute_content_hash(data) + + # Verify against known SHA256 hash + expected = hashlib.sha256(data).hexdigest() + assert result == expected + assert len(result) == 64 # SHA256 produces 64 hex chars + + def test_empty_bytes(self): + """Test hashing empty bytes.""" + result = compute_content_hash(b"") + expected = hashlib.sha256(b"").hexdigest() + assert result == expected + + def test_different_content_different_hash(self): + """Test that different content produces different hashes.""" + hash1 = compute_content_hash(b"content1") + hash2 = compute_content_hash(b"content2") + assert hash1 != hash2 + + def test_same_content_same_hash(self): + """Test that same content produces same hash.""" + data = b"identical content" + hash1 = compute_content_hash(data) + hash2 = compute_content_hash(data) + assert hash1 == hash2 + + +class TestBuildContentPath: + """Tests for build_content_path function.""" + + def test_builds_hierarchical_path(self): + """Test that path is built with proper hierarchy.""" + # Example hash: abcdef... 
+ test_hash = "abcdef0123456789" * 4 # 64 chars + result = build_content_path(test_hash) + + # Path should be _content/{hash[:2]}/{hash[2:4]}/{hash} + assert result == f"_content/ab/cd/{test_hash}" + + def test_rejects_invalid_hash_length(self): + """Test that invalid hash length raises error.""" + with pytest.raises(DataJointError, match="Invalid content hash length"): + build_content_path("tooshort") + + with pytest.raises(DataJointError, match="Invalid content hash length"): + build_content_path("a" * 65) # Too long + + def test_real_hash_path(self): + """Test path building with a real computed hash.""" + data = b"test content" + content_hash = compute_content_hash(data) + path = build_content_path(content_hash) + + # Verify structure + parts = path.split("/") + assert parts[0] == "_content" + assert len(parts[1]) == 2 + assert len(parts[2]) == 2 + assert len(parts[3]) == 64 + assert parts[1] == content_hash[:2] + assert parts[2] == content_hash[2:4] + assert parts[3] == content_hash + + +class TestPutContent: + """Tests for put_content function.""" + + @patch("datajoint.content_registry.get_store_backend") + def test_stores_new_content(self, mock_get_backend): + """Test storing new content.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = False + mock_get_backend.return_value = mock_backend + + data = b"new content" + result = put_content(data, store_name="test_store") + + # Verify return value + assert "hash" in result + assert result["hash"] == compute_content_hash(data) + assert result["store"] == "test_store" + assert result["size"] == len(data) + + # Verify backend was called + mock_backend.put_buffer.assert_called_once() + + @patch("datajoint.content_registry.get_store_backend") + def test_deduplicates_existing_content(self, mock_get_backend): + """Test that existing content is not re-uploaded.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = True # Content already exists + mock_get_backend.return_value = 
mock_backend + + data = b"existing content" + result = put_content(data, store_name="test_store") + + # Verify return value is still correct + assert result["hash"] == compute_content_hash(data) + assert result["size"] == len(data) + + # Verify put_buffer was NOT called (deduplication) + mock_backend.put_buffer.assert_not_called() + + +class TestGetContent: + """Tests for get_content function.""" + + @patch("datajoint.content_registry.get_store_backend") + def test_retrieves_content(self, mock_get_backend): + """Test retrieving content by hash.""" + data = b"stored content" + content_hash = compute_content_hash(data) + + mock_backend = MagicMock() + mock_backend.get_buffer.return_value = data + mock_get_backend.return_value = mock_backend + + result = get_content(content_hash, store_name="test_store") + + assert result == data + + @patch("datajoint.content_registry.get_store_backend") + def test_verifies_hash(self, mock_get_backend): + """Test that hash is verified on retrieval.""" + data = b"original content" + content_hash = compute_content_hash(data) + + # Return corrupted data + mock_backend = MagicMock() + mock_backend.get_buffer.return_value = b"corrupted content" + mock_get_backend.return_value = mock_backend + + with pytest.raises(DataJointError, match="Content hash mismatch"): + get_content(content_hash, store_name="test_store") + + +class TestContentExists: + """Tests for content_exists function.""" + + @patch("datajoint.content_registry.get_store_backend") + def test_returns_true_when_exists(self, mock_get_backend): + """Test that True is returned when content exists.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = True + mock_get_backend.return_value = mock_backend + + content_hash = "a" * 64 + assert content_exists(content_hash, store_name="test_store") is True + + @patch("datajoint.content_registry.get_store_backend") + def test_returns_false_when_not_exists(self, mock_get_backend): + """Test that False is returned when content 
doesn't exist.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = False + mock_get_backend.return_value = mock_backend + + content_hash = "a" * 64 + assert content_exists(content_hash, store_name="test_store") is False + + +class TestDeleteContent: + """Tests for delete_content function.""" + + @patch("datajoint.content_registry.get_store_backend") + def test_deletes_existing_content(self, mock_get_backend): + """Test deleting existing content.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = True + mock_get_backend.return_value = mock_backend + + content_hash = "a" * 64 + result = delete_content(content_hash, store_name="test_store") + + assert result is True + mock_backend.remove.assert_called_once() + + @patch("datajoint.content_registry.get_store_backend") + def test_returns_false_for_nonexistent(self, mock_get_backend): + """Test that False is returned when content doesn't exist.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = False + mock_get_backend.return_value = mock_backend + + content_hash = "a" * 64 + result = delete_content(content_hash, store_name="test_store") + + assert result is False + mock_backend.remove.assert_not_called() + + +class TestGetContentSize: + """Tests for get_content_size function.""" + + @patch("datajoint.content_registry.get_store_backend") + def test_returns_size(self, mock_get_backend): + """Test getting content size.""" + mock_backend = MagicMock() + mock_backend.size.return_value = 1024 + mock_get_backend.return_value = mock_backend + + content_hash = "a" * 64 + result = get_content_size(content_hash, store_name="test_store") + + assert result == 1024 diff --git a/tests/test_gc.py b/tests/test_gc.py new file mode 100644 index 000000000..2c312bcc0 --- /dev/null +++ b/tests/test_gc.py @@ -0,0 +1,337 @@ +""" +Tests for garbage collection (gc.py). 
+""" + +from unittest.mock import MagicMock, patch + +import pytest + +from datajoint import gc +from datajoint.errors import DataJointError + + +class TestUsesContentStorage: + """Tests for _uses_content_storage helper function.""" + + def test_returns_false_for_no_adapter(self): + """Test that False is returned when attribute has no adapter.""" + attr = MagicMock() + attr.adapter = None + + assert gc._uses_content_storage(attr) is False + + def test_returns_true_for_content_type(self): + """Test that True is returned for type.""" + attr = MagicMock() + attr.adapter = MagicMock() + attr.adapter.type_name = "content" + + assert gc._uses_content_storage(attr) is True + + def test_returns_true_for_xblob_type(self): + """Test that True is returned for type.""" + attr = MagicMock() + attr.adapter = MagicMock() + attr.adapter.type_name = "xblob" + + assert gc._uses_content_storage(attr) is True + + def test_returns_true_for_xattach_type(self): + """Test that True is returned for type.""" + attr = MagicMock() + attr.adapter = MagicMock() + attr.adapter.type_name = "xattach" + + assert gc._uses_content_storage(attr) is True + + def test_returns_false_for_other_types(self): + """Test that False is returned for non-content types.""" + attr = MagicMock() + attr.adapter = MagicMock() + attr.adapter.type_name = "djblob" + + assert gc._uses_content_storage(attr) is False + + +class TestExtractContentRefs: + """Tests for _extract_content_refs helper function.""" + + def test_returns_empty_for_none(self): + """Test that empty list is returned for None value.""" + assert gc._extract_content_refs(None) == [] + + def test_parses_json_string(self): + """Test parsing JSON string with hash.""" + value = '{"hash": "abc123", "store": "mystore"}' + refs = gc._extract_content_refs(value) + + assert len(refs) == 1 + assert refs[0] == ("abc123", "mystore") + + def test_parses_dict_directly(self): + """Test parsing dict with hash.""" + value = {"hash": "def456", "store": None} + refs = 
gc._extract_content_refs(value) + + assert len(refs) == 1 + assert refs[0] == ("def456", None) + + def test_returns_empty_for_invalid_json(self): + """Test that empty list is returned for invalid JSON.""" + assert gc._extract_content_refs("not json") == [] + + def test_returns_empty_for_dict_without_hash(self): + """Test that empty list is returned for dict without hash key.""" + assert gc._extract_content_refs({"other": "data"}) == [] + + +class TestUsesObjectStorage: + """Tests for _uses_object_storage helper function.""" + + def test_returns_false_for_no_adapter(self): + """Test that False is returned when attribute has no adapter.""" + attr = MagicMock() + attr.adapter = None + + assert gc._uses_object_storage(attr) is False + + def test_returns_true_for_object_type(self): + """Test that True is returned for type.""" + attr = MagicMock() + attr.adapter = MagicMock() + attr.adapter.type_name = "object" + + assert gc._uses_object_storage(attr) is True + + def test_returns_false_for_other_types(self): + """Test that False is returned for non-object types.""" + attr = MagicMock() + attr.adapter = MagicMock() + attr.adapter.type_name = "xblob" + + assert gc._uses_object_storage(attr) is False + + +class TestExtractObjectRefs: + """Tests for _extract_object_refs helper function.""" + + def test_returns_empty_for_none(self): + """Test that empty list is returned for None value.""" + assert gc._extract_object_refs(None) == [] + + def test_parses_json_string(self): + """Test parsing JSON string with path.""" + value = '{"path": "schema/table/objects/pk/field_abc123", "store": "mystore"}' + refs = gc._extract_object_refs(value) + + assert len(refs) == 1 + assert refs[0] == ("schema/table/objects/pk/field_abc123", "mystore") + + def test_parses_dict_directly(self): + """Test parsing dict with path.""" + value = {"path": "test/path", "store": None} + refs = gc._extract_object_refs(value) + + assert len(refs) == 1 + assert refs[0] == ("test/path", None) + + def 
test_returns_empty_for_dict_without_path(self): + """Test that empty list is returned for dict without path key.""" + assert gc._extract_object_refs({"other": "data"}) == [] + + +class TestScan: + """Tests for scan function.""" + + def test_requires_at_least_one_schema(self): + """Test that at least one schema is required.""" + with pytest.raises(DataJointError, match="At least one schema must be provided"): + gc.scan() + + @patch("datajoint.gc.scan_object_references") + @patch("datajoint.gc.list_stored_objects") + @patch("datajoint.gc.scan_references") + @patch("datajoint.gc.list_stored_content") + def test_returns_stats(self, mock_list_content, mock_scan_refs, mock_list_objects, mock_scan_objects): + """Test that scan returns proper statistics.""" + # Mock content-addressed storage + mock_scan_refs.return_value = {"hash1", "hash2"} + mock_list_content.return_value = { + "hash1": 100, + "hash3": 200, # orphaned + } + + # Mock path-addressed storage + mock_scan_objects.return_value = {"path/to/obj1"} + mock_list_objects.return_value = { + "path/to/obj1": 500, + "path/to/obj2": 300, # orphaned + } + + mock_schema = MagicMock() + stats = gc.scan(mock_schema, store_name="test_store") + + # Content stats + assert stats["content_referenced"] == 2 + assert stats["content_stored"] == 2 + assert stats["content_orphaned"] == 1 + assert "hash3" in stats["orphaned_hashes"] + + # Object stats + assert stats["object_referenced"] == 1 + assert stats["object_stored"] == 2 + assert stats["object_orphaned"] == 1 + assert "path/to/obj2" in stats["orphaned_paths"] + + # Combined totals + assert stats["referenced"] == 3 + assert stats["stored"] == 4 + assert stats["orphaned"] == 2 + assert stats["orphaned_bytes"] == 500 # 200 content + 300 object + + +class TestCollect: + """Tests for collect function.""" + + @patch("datajoint.gc.scan") + def test_dry_run_does_not_delete(self, mock_scan): + """Test that dry_run=True doesn't delete anything.""" + mock_scan.return_value = { + 
"referenced": 1, + "stored": 2, + "orphaned": 1, + "orphaned_bytes": 100, + "orphaned_hashes": ["orphan_hash"], + "orphaned_paths": [], + "content_orphaned": 1, + "object_orphaned": 0, + } + + mock_schema = MagicMock() + stats = gc.collect(mock_schema, store_name="test_store", dry_run=True) + + assert stats["deleted"] == 0 + assert stats["bytes_freed"] == 0 + assert stats["dry_run"] is True + + @patch("datajoint.gc.delete_content") + @patch("datajoint.gc.list_stored_content") + @patch("datajoint.gc.scan") + def test_deletes_orphaned_content(self, mock_scan, mock_list_stored, mock_delete): + """Test that orphaned content is deleted when dry_run=False.""" + mock_scan.return_value = { + "referenced": 1, + "stored": 2, + "orphaned": 1, + "orphaned_bytes": 100, + "orphaned_hashes": ["orphan_hash"], + "orphaned_paths": [], + "content_orphaned": 1, + "object_orphaned": 0, + } + mock_list_stored.return_value = {"orphan_hash": 100} + mock_delete.return_value = True + + mock_schema = MagicMock() + stats = gc.collect(mock_schema, store_name="test_store", dry_run=False) + + assert stats["deleted"] == 1 + assert stats["content_deleted"] == 1 + assert stats["bytes_freed"] == 100 + assert stats["dry_run"] is False + mock_delete.assert_called_once_with("orphan_hash", "test_store") + + @patch("datajoint.gc.delete_object") + @patch("datajoint.gc.list_stored_objects") + @patch("datajoint.gc.scan") + def test_deletes_orphaned_objects(self, mock_scan, mock_list_objects, mock_delete): + """Test that orphaned objects are deleted when dry_run=False.""" + mock_scan.return_value = { + "referenced": 1, + "stored": 2, + "orphaned": 1, + "orphaned_bytes": 500, + "orphaned_hashes": [], + "orphaned_paths": ["path/to/orphan"], + "content_orphaned": 0, + "object_orphaned": 1, + } + mock_list_objects.return_value = {"path/to/orphan": 500} + mock_delete.return_value = True + + mock_schema = MagicMock() + stats = gc.collect(mock_schema, store_name="test_store", dry_run=False) + + assert 
stats["deleted"] == 1 + assert stats["object_deleted"] == 1 + assert stats["bytes_freed"] == 500 + assert stats["dry_run"] is False + mock_delete.assert_called_once_with("path/to/orphan", "test_store") + + +class TestFormatStats: + """Tests for format_stats function.""" + + def test_formats_scan_stats(self): + """Test formatting scan statistics.""" + stats = { + "referenced": 10, + "stored": 15, + "orphaned": 5, + "orphaned_bytes": 1024 * 1024, # 1 MB + "content_referenced": 6, + "content_stored": 8, + "content_orphaned": 2, + "content_orphaned_bytes": 512 * 1024, + "object_referenced": 4, + "object_stored": 7, + "object_orphaned": 3, + "object_orphaned_bytes": 512 * 1024, + } + + result = gc.format_stats(stats) + + assert "Referenced in database: 10" in result + assert "Stored in backend: 15" in result + assert "Orphaned (unreferenced): 5" in result + assert "1.00 MB" in result + # Check for detailed sections + assert "Content-Addressed Storage" in result + assert "Path-Addressed Storage" in result + + def test_formats_collect_stats_dry_run(self): + """Test formatting collect statistics with dry_run.""" + stats = { + "referenced": 10, + "stored": 15, + "orphaned": 5, + "deleted": 0, + "bytes_freed": 0, + "dry_run": True, + } + + result = gc.format_stats(stats) + + assert "DRY RUN" in result + + def test_formats_collect_stats_actual(self): + """Test formatting collect statistics after actual deletion.""" + stats = { + "referenced": 10, + "stored": 15, + "orphaned": 5, + "deleted": 3, + "content_deleted": 2, + "object_deleted": 1, + "bytes_freed": 2 * 1024 * 1024, # 2 MB + "errors": 2, + "dry_run": False, + } + + result = gc.format_stats(stats) + + assert "Deleted: 3" in result + assert "Content: 2" in result + assert "Objects: 1" in result + assert "2.00 MB" in result + assert "Errors: 2" in result diff --git a/tests/test_type_composition.py b/tests/test_type_composition.py new file mode 100644 index 000000000..0b51b3d68 --- /dev/null +++ 
b/tests/test_type_composition.py @@ -0,0 +1,352 @@ +""" +Tests for type composition (type chain encoding/decoding). + +This tests the → json composition pattern +and similar type chains. +""" + +from datajoint.attribute_type import ( + AttributeType, + _type_registry, + register_type, + resolve_dtype, +) + + +class TestTypeChainResolution: + """Tests for resolving type chains.""" + + def setup_method(self): + """Clear test types from registry before each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + """Clean up test types after each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_single_type_chain(self): + """Test resolving a single-type chain.""" + + @register_type + class TestSingle(AttributeType): + type_name = "test_single" + dtype = "varchar(100)" + + def encode(self, value, *, key=None, store_name=None): + return str(value) + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "varchar(100)" + assert len(chain) == 1 + assert chain[0].type_name == "test_single" + assert store is None + + def test_two_type_chain(self): + """Test resolving a two-type chain.""" + + @register_type + class TestInner(AttributeType): + type_name = "test_inner" + dtype = "longblob" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TestOuter(AttributeType): + type_name = "test_outer" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "longblob" + assert len(chain) == 2 + assert chain[0].type_name == "test_outer" + assert chain[1].type_name == "test_inner" + + 
def test_three_type_chain(self): + """Test resolving a three-type chain.""" + + @register_type + class TestBase(AttributeType): + type_name = "test_base" + dtype = "json" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TestMiddle(AttributeType): + type_name = "test_middle" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TestTop(AttributeType): + type_name = "test_top" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 3 + assert chain[0].type_name == "test_top" + assert chain[1].type_name == "test_middle" + assert chain[2].type_name == "test_base" + + +class TestTypeChainEncodeDecode: + """Tests for encode/decode through type chains.""" + + def setup_method(self): + """Clear test types from registry before each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def teardown_method(self): + """Clean up test types after each test.""" + for name in list(_type_registry.keys()): + if name.startswith("test_"): + del _type_registry[name] + + def test_encode_order(self): + """Test that encode is applied outer → inner.""" + encode_order = [] + + @register_type + class TestInnerEnc(AttributeType): + type_name = "test_inner_enc" + dtype = "longblob" + + def encode(self, value, *, key=None, store_name=None): + encode_order.append("inner") + return value + b"_inner" + + def decode(self, stored, *, key=None): + return stored + + @register_type + class TestOuterEnc(AttributeType): + type_name = "test_outer_enc" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + 
encode_order.append("outer") + return value + b"_outer" + + def decode(self, stored, *, key=None): + return stored + + _, chain, _ = resolve_dtype("") + + # Apply encode in order: outer first, then inner + value = b"start" + for attr_type in chain: + value = attr_type.encode(value) + + assert encode_order == ["outer", "inner"] + assert value == b"start_outer_inner" + + def test_decode_order(self): + """Test that decode is applied inner → outer (reverse of encode).""" + decode_order = [] + + @register_type + class TestInnerDec(AttributeType): + type_name = "test_inner_dec" + dtype = "longblob" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + decode_order.append("inner") + return stored.replace(b"_inner", b"") + + @register_type + class TestOuterDec(AttributeType): + type_name = "test_outer_dec" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + decode_order.append("outer") + return stored.replace(b"_outer", b"") + + _, chain, _ = resolve_dtype("") + + # Apply decode in reverse order: inner first, then outer + value = b"start_outer_inner" + for attr_type in reversed(chain): + value = attr_type.decode(value) + + assert decode_order == ["inner", "outer"] + assert value == b"start" + + def test_roundtrip(self): + """Test encode/decode roundtrip through a type chain.""" + + @register_type + class TestInnerRt(AttributeType): + type_name = "test_inner_rt" + dtype = "longblob" + + def encode(self, value, *, key=None, store_name=None): + # Compress (just add prefix for testing) + return b"COMPRESSED:" + value + + def decode(self, stored, *, key=None): + # Decompress + return stored.replace(b"COMPRESSED:", b"") + + @register_type + class TestOuterRt(AttributeType): + type_name = "test_outer_rt" + dtype = "" + + def encode(self, value, *, key=None, store_name=None): + # Serialize (just encode string for testing) + return 
str(value).encode("utf-8") + + def decode(self, stored, *, key=None): + # Deserialize + return stored.decode("utf-8") + + _, chain, _ = resolve_dtype("") + + # Original value + original = "test data" + + # Encode: outer → inner + encoded = original + for attr_type in chain: + encoded = attr_type.encode(encoded) + + assert encoded == b"COMPRESSED:test data" + + # Decode: inner → outer (reversed) + decoded = encoded + for attr_type in reversed(chain): + decoded = attr_type.decode(decoded) + + assert decoded == original + + +class TestBuiltinTypeComposition: + """Tests for built-in type composition.""" + + def test_xblob_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 2 + assert chain[0].type_name == "xblob" + assert chain[1].type_name == "content" + + def test_xattach_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 2 + assert chain[0].type_name == "xattach" + assert chain[1].type_name == "content" + + def test_djblob_resolves_to_longblob(self): + """Test that → longblob (no chain).""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "longblob" + assert len(chain) == 1 + assert chain[0].type_name == "djblob" + + def test_content_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 1 + assert chain[0].type_name == "content" + + def test_object_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 1 + assert chain[0].type_name == "object" + + def test_attach_resolves_to_longblob(self): + """Test that → longblob.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "longblob" + assert len(chain) == 1 + assert chain[0].type_name == "attach" + 
def test_filepath_resolves_to_json(self): + """Test that <filepath> → json.""" + final_dtype, chain, _ = resolve_dtype("<filepath>")