From 7eefea8ea2afa10f5fff02469260894b1f7b98f8 Mon Sep 17 00:00:00 2001 From: deanban <3989225+deanban@users.noreply.github.com> Date: Fri, 8 May 2026 14:46:40 -0400 Subject: [PATCH] feat: target ontology adapter protocol and manifest loader Adds TargetOntologyAdapter protocol with declarative target-side DTOs, three-stage loader (normalize, hash, materialize), per-facet enrichment decisions, hash-versioned identity for schema-bearing nodes, and ManifestTargetAdapter as the protocol's golden compliance fixture. Signed-off-by: deanban <3989225+deanban@users.noreply.github.com> --- pyproject.toml | 2 + src/sema/cli.py | 3 + src/sema/cli_target.py | 92 ++++ src/sema/cli_target_utils.py | 48 ++ src/sema/graph/target_loader_migrations.py | 131 +++++ src/sema/models/target/__init__.py | 8 + src/sema/models/target/completeness.py | 29 + src/sema/models/target/context_card.py | 73 +++ src/sema/models/target/descriptor.py | 25 + src/sema/models/target/endpoints.py | 31 ++ src/sema/models/target/enrichment.py | 57 ++ src/sema/models/target/entity.py | 34 ++ src/sema/models/target/loaded.py | 26 + src/sema/models/target/normalized.py | 25 + src/sema/models/target/obligation.py | 7 + src/sema/models/target/properties.py | 59 +++ src/sema/models/target/refs.py | 42 ++ src/sema/models/target/term.py | 16 + src/sema/models/target/vocab_binding.py | 20 + src/sema/targets/__init__.py | 52 ++ src/sema/targets/adapters/__init__.py | 3 + src/sema/targets/adapters/manifest.py | 157 ++++++ .../targets/adapters/manifest_exceptions.py | 35 ++ src/sema/targets/adapters/manifest_models.py | 161 ++++++ src/sema/targets/adapters/manifest_parser.py | 149 ++++++ src/sema/targets/adapters/manifest_utils.py | 366 +++++++++++++ src/sema/targets/base.py | 58 ++ src/sema/targets/exceptions.py | 43 ++ src/sema/targets/hashing.py | 82 +++ src/sema/targets/hashing_utils.py | 57 ++ src/sema/targets/loader.py | 150 ++++++ src/sema/targets/loader_utils.py | 190 +++++++ src/sema/targets/materializer.py | 
200 +++++++ .../materializer_binding_card_utils.py | 140 +++++ src/sema/targets/materializer_ops.py | 164 ++++++ src/sema/targets/materializer_utils.py | 340 ++++++++++++ src/sema/targets/neo4j_writer.py | 99 ++++ src/sema/targets/neo4j_writer_flip_utils.py | 182 +++++++ src/sema/targets/neo4j_writer_utils.py | 377 +++++++++++++ src/sema/targets/normalizer.py | 123 +++++ src/sema/targets/normalizer_utils.py | 162 ++++++ src/sema/targets/registry.py | 231 ++++++++ src/sema/targets/registry_utils.py | 125 +++++ tests/integration/targets/__init__.py | 0 .../targets/test_target_loader_round_trip.py | 372 +++++++++++++ tests/unit/models/target/__init__.py | 0 tests/unit/models/target/conftest.py | 3 + tests/unit/models/target/test_completeness.py | 57 ++ tests/unit/models/target/test_context_card.py | 93 ++++ tests/unit/models/target/test_descriptor.py | 69 +++ tests/unit/models/target/test_dto_meta.py | 41 ++ tests/unit/models/target/test_endpoints.py | 31 ++ tests/unit/models/target/test_entity_decl.py | 79 +++ .../models/target/test_obligation_reexport.py | 15 + .../unit/models/target/test_property_decl.py | 68 +++ tests/unit/models/target/test_refs.py | 62 +++ tests/unit/models/target/test_term_decl.py | 30 ++ .../unit/models/target/test_vocab_binding.py | 56 ++ tests/unit/targets/__init__.py | 0 tests/unit/targets/conftest.py | 280 ++++++++++ .../targets/fixtures/golden_manifest.yaml | 162 ++++++ .../targets/fixtures/golden_manifest_hash.txt | 1 + tests/unit/targets/test_cross_capability.py | 172 ++++++ tests/unit/targets/test_golden_manifest.py | 239 +++++++++ tests/unit/targets/test_hashing.py | 136 +++++ tests/unit/targets/test_import_boundaries.py | 166 ++++++ tests/unit/targets/test_loader.py | 327 ++++++++++++ tests/unit/targets/test_manifest_adapter.py | 498 ++++++++++++++++++ .../targets/test_manifest_registration.py | 69 +++ tests/unit/targets/test_materializer.py | 485 +++++++++++++++++ tests/unit/targets/test_neo4j_writer.py | 371 +++++++++++++ 
tests/unit/targets/test_normalizer.py | 289 ++++++++++ tests/unit/targets/test_protocol.py | 65 +++ tests/unit/targets/test_registry.py | 174 ++++++ .../targets/test_target_loader_migrations.py | 138 +++++ tests/unit/test_cli_target.py | 80 +++ uv.lock | 4 + 77 files changed, 9006 insertions(+) create mode 100644 src/sema/cli_target.py create mode 100644 src/sema/cli_target_utils.py create mode 100644 src/sema/graph/target_loader_migrations.py create mode 100644 src/sema/models/target/__init__.py create mode 100644 src/sema/models/target/completeness.py create mode 100644 src/sema/models/target/context_card.py create mode 100644 src/sema/models/target/descriptor.py create mode 100644 src/sema/models/target/endpoints.py create mode 100644 src/sema/models/target/enrichment.py create mode 100644 src/sema/models/target/entity.py create mode 100644 src/sema/models/target/loaded.py create mode 100644 src/sema/models/target/normalized.py create mode 100644 src/sema/models/target/obligation.py create mode 100644 src/sema/models/target/properties.py create mode 100644 src/sema/models/target/refs.py create mode 100644 src/sema/models/target/term.py create mode 100644 src/sema/models/target/vocab_binding.py create mode 100644 src/sema/targets/__init__.py create mode 100644 src/sema/targets/adapters/__init__.py create mode 100644 src/sema/targets/adapters/manifest.py create mode 100644 src/sema/targets/adapters/manifest_exceptions.py create mode 100644 src/sema/targets/adapters/manifest_models.py create mode 100644 src/sema/targets/adapters/manifest_parser.py create mode 100644 src/sema/targets/adapters/manifest_utils.py create mode 100644 src/sema/targets/base.py create mode 100644 src/sema/targets/exceptions.py create mode 100644 src/sema/targets/hashing.py create mode 100644 src/sema/targets/hashing_utils.py create mode 100644 src/sema/targets/loader.py create mode 100644 src/sema/targets/loader_utils.py create mode 100644 src/sema/targets/materializer.py create mode 100644 
src/sema/targets/materializer_binding_card_utils.py create mode 100644 src/sema/targets/materializer_ops.py create mode 100644 src/sema/targets/materializer_utils.py create mode 100644 src/sema/targets/neo4j_writer.py create mode 100644 src/sema/targets/neo4j_writer_flip_utils.py create mode 100644 src/sema/targets/neo4j_writer_utils.py create mode 100644 src/sema/targets/normalizer.py create mode 100644 src/sema/targets/normalizer_utils.py create mode 100644 src/sema/targets/registry.py create mode 100644 src/sema/targets/registry_utils.py create mode 100644 tests/integration/targets/__init__.py create mode 100644 tests/integration/targets/test_target_loader_round_trip.py create mode 100644 tests/unit/models/target/__init__.py create mode 100644 tests/unit/models/target/conftest.py create mode 100644 tests/unit/models/target/test_completeness.py create mode 100644 tests/unit/models/target/test_context_card.py create mode 100644 tests/unit/models/target/test_descriptor.py create mode 100644 tests/unit/models/target/test_dto_meta.py create mode 100644 tests/unit/models/target/test_endpoints.py create mode 100644 tests/unit/models/target/test_entity_decl.py create mode 100644 tests/unit/models/target/test_obligation_reexport.py create mode 100644 tests/unit/models/target/test_property_decl.py create mode 100644 tests/unit/models/target/test_refs.py create mode 100644 tests/unit/models/target/test_term_decl.py create mode 100644 tests/unit/models/target/test_vocab_binding.py create mode 100644 tests/unit/targets/__init__.py create mode 100644 tests/unit/targets/conftest.py create mode 100644 tests/unit/targets/fixtures/golden_manifest.yaml create mode 100644 tests/unit/targets/fixtures/golden_manifest_hash.txt create mode 100644 tests/unit/targets/test_cross_capability.py create mode 100644 tests/unit/targets/test_golden_manifest.py create mode 100644 tests/unit/targets/test_hashing.py create mode 100644 tests/unit/targets/test_import_boundaries.py create mode 100644 
tests/unit/targets/test_loader.py create mode 100644 tests/unit/targets/test_manifest_adapter.py create mode 100644 tests/unit/targets/test_manifest_registration.py create mode 100644 tests/unit/targets/test_materializer.py create mode 100644 tests/unit/targets/test_neo4j_writer.py create mode 100644 tests/unit/targets/test_normalizer.py create mode 100644 tests/unit/targets/test_protocol.py create mode 100644 tests/unit/targets/test_registry.py create mode 100644 tests/unit/targets/test_target_loader_migrations.py create mode 100644 tests/unit/test_cli_target.py diff --git a/pyproject.toml b/pyproject.toml index 2c37f04..b61e5d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,8 @@ dependencies = [ "loguru>=0.7.3", "duckdb>=1.0.0", "pyarrow>=14.0.0", + "pyyaml>=6.0", + "packaging>=24.0", ] [project.scripts] diff --git a/src/sema/cli.py b/src/sema/cli.py index 4bbaf7c..3658c13 100644 --- a/src/sema/cli.py +++ b/src/sema/cli.py @@ -340,6 +340,9 @@ def query( sys.exit(1) +from sema.cli_target import target_group as _target_group + cli.add_command(_ingest_group, name="ingest") cli.add_command(_push_cmd, name="push") cli.add_command(_eval_group, name="eval") +cli.add_command(_target_group, name="target") diff --git a/src/sema/cli_target.py b/src/sema/cli_target.py new file mode 100644 index 0000000..ffe62d3 --- /dev/null +++ b/src/sema/cli_target.py @@ -0,0 +1,92 @@ +"""CLI: `sema target load --manifest <path>`. + +Connects any user-supplied ontology manifest to the target loader and +prints a `LoadedTarget` summary as JSON. Defaults to the in-memory +writer for fast inspection; `--writer neo4j` materialises into the +graph using `Neo4jGraphWriter`.
+""" + +from __future__ import annotations + +import json +import os +import sys +from pathlib import Path + +import click + +from sema.cli_target_utils import build_summary +from sema.targets.adapters.manifest import ( + ManifestTargetAdapter, + register_manifest_adapter, +) +from sema.targets.loader import load_target +from sema.targets.materializer import GraphWriter, InMemoryGraphWriter + + +@click.group(name="target") +def target_group() -> None: + """Target ontology operations.""" + register_manifest_adapter() + + +@target_group.command(name="load") +@click.option( + "--manifest", + "manifest_path", + type=click.Path(exists=True, dir_okay=False, path_type=Path), + required=True, + help="Path to the YAML/JSON manifest declaring the target ontology.", +) +@click.option( + "--writer", + type=click.Choice(["in-memory", "neo4j"]), + default="in-memory", + show_default=True, + help="Where to materialise the target. `in-memory` records ops without writing.", +) +@click.option( + "--skip-facet", + "skip_facets", + multiple=True, + help="Repeatable. Operator opt-out per facet (e.g. semantic_aliases).", +) +def load_cmd( + manifest_path: Path, writer: str, skip_facets: tuple[str, ...] 
+) -> None: + """Load a manifest and print a LoadedTarget summary as JSON.""" + try: + adapter = ManifestTargetAdapter(manifest_path) + graph_writer = _build_writer(writer) + loaded = load_target( + adapter, writer=graph_writer, skip_facets=list(skip_facets) + ) + except FileNotFoundError as exc: + click.echo(f"Error: {exc}", err=True) + sys.exit(2) + except Exception as exc: + click.echo(f"Error: {exc}", err=True) + sys.exit(1) + click.echo(json.dumps(build_summary(loaded), indent=2, default=str)) + + +def _build_writer(kind: str) -> GraphWriter: + if kind == "in-memory": + return InMemoryGraphWriter() + if kind == "neo4j": + return _neo4j_writer_from_env() + raise click.BadParameter(f"unknown writer: {kind!r}") + + +def _neo4j_writer_from_env() -> GraphWriter: + neo4j = __import__("neo4j") + from sema.targets.neo4j_writer import Neo4jGraphWriter + + uri = os.getenv("NEO4J_URI", "bolt://localhost:7687") + user = os.getenv("NEO4J_USER", "neo4j") + password = os.getenv("NEO4J_PASSWORD", "graphrag") + driver = neo4j.GraphDatabase.driver(uri, auth=(user, password)) + return Neo4jGraphWriter(driver) + + +__all__ = ["target_group", "load_cmd"] diff --git a/src/sema/cli_target_utils.py b/src/sema/cli_target_utils.py new file mode 100644 index 0000000..37d4478 --- /dev/null +++ b/src/sema/cli_target_utils.py @@ -0,0 +1,48 @@ +"""Helpers for `cli_target.py`: shape `LoadedTarget` for CLI output.""" + +from __future__ import annotations + +from typing import Any + +from sema.models.target.loaded import LoadedTarget + + +def build_summary(loaded: LoadedTarget) -> dict[str, Any]: + return { + "target_model_id": loaded.descriptor.target_model_id, + "target_model_version": loaded.descriptor.target_model_version, + "target_schema_snapshot_hash": loaded.target_schema_snapshot_hash, + "aggregate_context_card_version": loaded.aggregate_context_card_version, + "materialized_at": loaded.materialized_at.isoformat(), + "entities": [ + {"qualified_name": e.qualified_name, "kind": 
e.kind.value} + for e in loaded.entity_refs + ], + "enrichment_decisions": [ + _decision_to_dict(d) for d in loaded.enrichment_decisions + ], + "context_cards": [ + { + "entity_ref": c.entity_ref.qualified_name, + "card_version": c.card_version, + "card_hash": c.card_hash, + } + for c in loaded.context_cards + ], + } + + +def _decision_to_dict(record: Any) -> dict[str, Any]: + return { + "entity_ref": record.entity_ref.qualified_name, + "decisions": { + facet.value: { + "status": fd.status.value, + "reason": fd.reason, + } + for facet, fd in record.decisions.items() + }, + } + + +__all__ = ["build_summary"] diff --git a/src/sema/graph/target_loader_migrations.py b/src/sema/graph/target_loader_migrations.py new file mode 100644 index 0000000..c113ae9 --- /dev/null +++ b/src/sema/graph/target_loader_migrations.py @@ -0,0 +1,131 @@ +"""Cypher migrations for the target-model-loader storage shape. + +Extends the planner contract's `planner-graph-storage` migrations with +the target-side schema artifacts produced by `TargetModelMaterializer`: +the `EnrichmentDecision` label, hash-versioned uniqueness constraints, +and indexes used by enrichment-status query workloads. 
+""" + +from __future__ import annotations + + +_FACETS = ( + "structure", + "obligations", + "vocabulary_bindings", + "semantic_aliases", + "terms", +) + + +def cypher_up() -> list[str]: + """Forward migration: target-loader uniqueness constraints + indexes.""" + statements: list[str] = [] + statements.extend(_uniqueness_constraints()) + statements.extend(_indexes()) + return statements + + +def cypher_down() -> list[str]: + """Reverse migration: drop target-loader constraints, indexes, labels.""" + statements: list[str] = [] + statements.extend( + f"DROP INDEX entity_enrichment_{f}_status IF EXISTS" for f in _FACETS + ) + statements.extend( + [ + "DROP INDEX entity_is_current IF EXISTS", + "DROP INDEX property_property_kind IF EXISTS", + "DROP INDEX property_is_current IF EXISTS", + "DROP INDEX target_obligation_is_current IF EXISTS", + "DROP INDEX target_term_is_current IF EXISTS", + "DROP INDEX target_constraint_is_current IF EXISTS", + "DROP INDEX target_vocab_binding_is_current IF EXISTS", + "DROP INDEX target_context_card_is_current IF EXISTS", + "DROP CONSTRAINT enrichment_decision_unique IF EXISTS", + "DROP CONSTRAINT target_entity_hash_unique IF EXISTS", + "DROP CONSTRAINT target_property_hash_unique IF EXISTS", + "DROP CONSTRAINT target_obligation_hash_unique IF EXISTS", + "DROP CONSTRAINT target_term_hash_unique IF EXISTS", + "DROP CONSTRAINT target_constraint_hash_unique IF EXISTS", + "DROP CONSTRAINT target_vocab_binding_hash_unique IF EXISTS", + "DROP CONSTRAINT target_context_card_hash_unique IF EXISTS", + "MATCH (n:EnrichmentDecision) DETACH DELETE n", + "MATCH (n:VocabularyBinding) DETACH DELETE n", + "MATCH (n:ContextCard) DETACH DELETE n", + ] + ) + return statements + + +def _uniqueness_constraints() -> list[str]: + return [ + "CREATE CONSTRAINT enrichment_decision_unique IF NOT EXISTS " + "FOR (n:EnrichmentDecision) " + "REQUIRE (n.target_model_id, n.target_model_version, " + "n.target_schema_snapshot_hash, n.entity_ref) IS UNIQUE", + "CREATE 
CONSTRAINT target_entity_hash_unique IF NOT EXISTS " + "FOR (n:Entity) " + "REQUIRE (n.target_model_id, n.target_model_version, " + "n.target_schema_snapshot_hash, n.qualified_name) IS UNIQUE", + "CREATE CONSTRAINT target_property_hash_unique IF NOT EXISTS " + "FOR (n:Property) " + "REQUIRE (n.target_model_id, n.target_model_version, " + "n.target_schema_snapshot_hash, n.parent_entity_qualified_name, n.name) " + "IS UNIQUE", + "CREATE CONSTRAINT target_obligation_hash_unique IF NOT EXISTS " + "FOR (n:TargetObligation) " + "REQUIRE (n.target_model_id, n.target_model_version, " + "n.target_schema_snapshot_hash, n.target_entity) IS UNIQUE", + "CREATE CONSTRAINT target_term_hash_unique IF NOT EXISTS " + "FOR (n:Term) " + "REQUIRE (n.target_model_id, n.target_model_version, " + "n.target_schema_snapshot_hash, n.vocabulary_name, n.code) IS UNIQUE", + "CREATE CONSTRAINT target_constraint_hash_unique IF NOT EXISTS " + "FOR (n:Constraint) " + "REQUIRE (n.target_model_id, n.target_model_version, " + "n.target_schema_snapshot_hash, n.attached_property_id, " + "n.constraint_kind, n.payload_hash) IS UNIQUE", + "CREATE CONSTRAINT target_vocab_binding_hash_unique IF NOT EXISTS " + "FOR (n:VocabularyBinding) " + "REQUIRE (n.target_model_id, n.target_model_version, " + "n.target_schema_snapshot_hash, n.parent_entity_qualified_name, " + "n.property_name, n.vocabulary_name) IS UNIQUE", + "CREATE CONSTRAINT target_context_card_hash_unique IF NOT EXISTS " + "FOR (n:ContextCard) " + "REQUIRE (n.target_model_id, n.target_model_version, " + "n.target_schema_snapshot_hash, n.entity_qualified_name, " + "n.card_version) IS UNIQUE", + ] + + +def _indexes() -> list[str]: + indexes = [ + f"CREATE INDEX entity_enrichment_{f}_status IF NOT EXISTS " + f"FOR (n:Entity) ON (n.enrichment_{f}_status)" + for f in _FACETS + ] + indexes.extend( + [ + "CREATE INDEX entity_is_current IF NOT EXISTS " + "FOR (n:Entity) ON (n.is_current)", + "CREATE INDEX property_property_kind IF NOT EXISTS " + "FOR 
(n:Property) ON (n.property_kind)", + "CREATE INDEX property_is_current IF NOT EXISTS " + "FOR (n:Property) ON (n.is_current)", + "CREATE INDEX target_obligation_is_current IF NOT EXISTS " + "FOR (n:TargetObligation) ON (n.is_current)", + "CREATE INDEX target_term_is_current IF NOT EXISTS " + "FOR (n:Term) ON (n.is_current)", + "CREATE INDEX target_constraint_is_current IF NOT EXISTS " + "FOR (n:Constraint) ON (n.is_current)", + "CREATE INDEX target_vocab_binding_is_current IF NOT EXISTS " + "FOR (n:VocabularyBinding) ON (n.is_current)", + "CREATE INDEX target_context_card_is_current IF NOT EXISTS " + "FOR (n:ContextCard) ON (n.is_current)", + ] + ) + return indexes + + +__all__ = ["cypher_up", "cypher_down"] diff --git a/src/sema/models/target/__init__.py b/src/sema/models/target/__init__.py new file mode 100644 index 0000000..ab76e40 --- /dev/null +++ b/src/sema/models/target/__init__.py @@ -0,0 +1,8 @@ +"""Target-side declarative DTOs. + +Adapter-facing models describing what a target ontology requires. +Distinct from `sema.models.planner.target_model`, which defines the +planner-contract graph shape (ModelRole, TargetObligation, etc.). +""" + +from __future__ import annotations diff --git a/src/sema/models/target/completeness.py b/src/sema/models/target/completeness.py new file mode 100644 index 0000000..3179b84 --- /dev/null +++ b/src/sema/models/target/completeness.py @@ -0,0 +1,29 @@ +"""Per-facet semantic-completeness annotations. + +A target ontology can be authoritative about some facets and silent about +others; per-facet annotations let the enrichment runner make per-facet +decisions instead of binary all-or-nothing. 
+""" + +from __future__ import annotations + +from enum import Enum + +from pydantic import BaseModel, ConfigDict + + +class SemanticCompleteness(str, Enum): + COMPLETE = "COMPLETE" + PARTIAL = "PARTIAL" + NONE = "NONE" + EXTERNAL = "EXTERNAL" + + +class SemanticCompletenessAnnotations(BaseModel): + model_config = ConfigDict(extra="forbid", frozen=True) + + structure: SemanticCompleteness + obligations: SemanticCompleteness + vocabulary_bindings: SemanticCompleteness + semantic_aliases: SemanticCompleteness + terms: SemanticCompleteness diff --git a/src/sema/models/target/context_card.py b/src/sema/models/target/context_card.py new file mode 100644 index 0000000..9ce5223 --- /dev/null +++ b/src/sema/models/target/context_card.py @@ -0,0 +1,73 @@ +"""TargetContextCard — stable target-Entity prefix material for prompts.""" + +from __future__ import annotations + +from typing import Self + +from packaging.version import InvalidVersion, Version +from pydantic import BaseModel, ConfigDict, Field, model_validator + +from sema.models.target.refs import TargetEntityRef + + +class TargetContextCard(BaseModel): + """Adapter-constructed context card. Adapters MUST set + `card_hash=None`; the loader populates the hash via + `LoadedContextCard.from_target_card`. 
+ """ + + model_config = ConfigDict(extra="forbid", frozen=True) + + entity_ref: TargetEntityRef + card_version: str = Field(min_length=1) + description: str = Field(min_length=1, max_length=4000) + examples: list[str] = Field(default_factory=list) + obligation_summary: str | None = None + curated_synonyms: list[str] = Field(default_factory=list) + card_hash: None = None + + @model_validator(mode="after") + def _validate_construction_invariants(self) -> Self: + if self.card_hash is not None: + raise ValueError( + "card_hash is owned by Sema; adapters MUST construct cards with " + "card_hash=None and let the loader populate it" + ) + try: + Version(self.card_version) + except InvalidVersion as exc: + raise ValueError( + f"card_version {self.card_version!r} is not a PEP 440-parseable version" + ) from exc + return self + + +class LoadedContextCard(BaseModel): + """Loader-populated context card carrying the Sema-computed + `card_hash`. Constructed only by the loader from a validated + `TargetContextCard`; adapters never instantiate this class. 
+ """ + + model_config = ConfigDict(extra="forbid", frozen=True) + + entity_ref: TargetEntityRef + card_version: str = Field(min_length=1) + description: str = Field(min_length=1, max_length=4000) + examples: list[str] = Field(default_factory=list) + obligation_summary: str | None = None + curated_synonyms: list[str] = Field(default_factory=list) + card_hash: str = Field(min_length=64, max_length=64) + + @classmethod + def from_target_card( + cls, source: TargetContextCard, card_hash: str + ) -> "LoadedContextCard": + return cls( + entity_ref=source.entity_ref, + card_version=source.card_version, + description=source.description, + examples=list(source.examples), + obligation_summary=source.obligation_summary, + curated_synonyms=list(source.curated_synonyms), + card_hash=card_hash, + ) diff --git a/src/sema/models/target/descriptor.py b/src/sema/models/target/descriptor.py new file mode 100644 index 0000000..45973d0 --- /dev/null +++ b/src/sema/models/target/descriptor.py @@ -0,0 +1,25 @@ +"""TargetModelDescriptor — adapter-supplied identity + completeness.""" + +from __future__ import annotations + +from typing import Annotated + +from pydantic import BaseModel, ConfigDict, Field, StringConstraints + +from sema.models.target.completeness import SemanticCompletenessAnnotations + +KebabCaseId = Annotated[ + str, + StringConstraints(pattern=r"^[a-z][a-z0-9-]*$"), +] + + +class TargetModelDescriptor(BaseModel): + model_config = ConfigDict(extra="forbid", frozen=True) + + target_model_id: KebabCaseId + target_model_version: str = Field(min_length=1) + display_name: str = Field(min_length=1) + owner: str | None = None + vocabulary_release: str | None = None + completeness: SemanticCompletenessAnnotations diff --git a/src/sema/models/target/endpoints.py b/src/sema/models/target/endpoints.py new file mode 100644 index 0000000..a934ee2 --- /dev/null +++ b/src/sema/models/target/endpoints.py @@ -0,0 +1,31 @@ +"""GRAPH_EDGE endpoint declarations. 
+ +Adapters declare `endpoints.subject` and `endpoints.object` on +`GRAPH_EDGE` entities. The normalizer compiles these into reserved +endpoint `Property` instances; adapters never construct endpoint +properties directly. +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, ConfigDict + +from sema.models.target.refs import TargetEntityRef + + +class EdgeEndpointDecl(BaseModel): + model_config = ConfigDict(extra="forbid", frozen=True) + + role: Literal["subject", "object"] + target_entity: TargetEntityRef + cardinality: Literal["one", "many"] = "one" + nullable: bool = False + + +class EdgeEndpointsDecl(BaseModel): + model_config = ConfigDict(extra="forbid", frozen=True) + + subject: EdgeEndpointDecl + object: EdgeEndpointDecl diff --git a/src/sema/models/target/enrichment.py b/src/sema/models/target/enrichment.py new file mode 100644 index 0000000..fb3de5f --- /dev/null +++ b/src/sema/models/target/enrichment.py @@ -0,0 +1,57 @@ +"""Enrichment decision shape: facet, status, per-facet decision, per-entity record.""" + +from __future__ import annotations + +from datetime import datetime +from enum import Enum +from typing import Self + +from pydantic import BaseModel, ConfigDict, Field, model_validator + +from sema.models.target.refs import TargetEntityRef + + +class Facet(str, Enum): + structure = "structure" + obligations = "obligations" + vocabulary_bindings = "vocabulary_bindings" + semantic_aliases = "semantic_aliases" + terms = "terms" + + +class EnrichmentStatus(str, Enum): + not_required = "not_required" + required_deferred = "required_deferred" + required_skipped = "required_skipped" + supplied_by_adapter = "supplied_by_adapter" + + +_ALL_FACETS: frozenset[Facet] = frozenset(Facet) + + +class FacetDecision(BaseModel): + model_config = ConfigDict(extra="forbid", frozen=True) + + status: EnrichmentStatus + reason: str = Field(min_length=1) + decided_at: datetime + + +class 
EnrichmentDecisionRecord(BaseModel): + model_config = ConfigDict(extra="forbid", frozen=True) + + entity_ref: TargetEntityRef + decisions: dict[Facet, FacetDecision] + decided_at: datetime + + @model_validator(mode="after") + def _validate_all_facets_present(self) -> Self: + keys = frozenset(self.decisions.keys()) + if keys != _ALL_FACETS: + missing = sorted(f.value for f in (_ALL_FACETS - keys)) + extra = sorted(f.value for f in (keys - _ALL_FACETS)) + raise ValueError( + f"EnrichmentDecisionRecord.decisions must cover exactly the five facets; " + f"missing={missing} extra={extra}" + ) + return self diff --git a/src/sema/models/target/entity.py b/src/sema/models/target/entity.py new file mode 100644 index 0000000..865f1b5 --- /dev/null +++ b/src/sema/models/target/entity.py @@ -0,0 +1,34 @@ +"""TargetEntityDecl — entity-shape declaration with optional edge endpoints.""" + +from __future__ import annotations + +from typing import Self + +from pydantic import BaseModel, ConfigDict, Field, model_validator + +from sema.models.planner._enums import TargetArtifactKind +from sema.models.target.completeness import SemanticCompletenessAnnotations +from sema.models.target.endpoints import EdgeEndpointsDecl +from sema.models.target.properties import TargetPropertyDecl +from sema.models.target.refs import TargetEntityRef + + +class TargetEntityDecl(BaseModel): + model_config = ConfigDict(extra="forbid", frozen=True) + + ref: TargetEntityRef + properties: list[TargetPropertyDecl] = Field(default_factory=list) + completeness: SemanticCompletenessAnnotations | None = None + endpoints: EdgeEndpointsDecl | None = None + + @model_validator(mode="after") + def _validate_endpoints_kind_invariant(self) -> Self: + is_edge = self.ref.kind is TargetArtifactKind.GRAPH_EDGE + if is_edge and self.endpoints is None: + raise ValueError("GRAPH_EDGE entity requires endpoints") + if not is_edge and self.endpoints is not None: + raise ValueError( + f"endpoints is only valid for GRAPH_EDGE 
entities; " + f"kind={self.ref.kind.value} forbids endpoints" + ) + return self diff --git a/src/sema/models/target/loaded.py b/src/sema/models/target/loaded.py new file mode 100644 index 0000000..d16cf4e --- /dev/null +++ b/src/sema/models/target/loaded.py @@ -0,0 +1,26 @@ +"""LoadedTarget — value object returned by load_target().""" + +from __future__ import annotations + +from datetime import datetime + +from pydantic import BaseModel, ConfigDict, Field + +from sema.models.target.context_card import LoadedContextCard +from sema.models.target.descriptor import TargetModelDescriptor +from sema.models.target.enrichment import EnrichmentDecisionRecord +from sema.models.target.refs import TargetEntityRef + + +class LoadedTarget(BaseModel): + model_config = ConfigDict(extra="forbid", frozen=True) + + descriptor: TargetModelDescriptor + target_schema_snapshot_hash: str = Field(min_length=64, max_length=64) + entity_refs: list[TargetEntityRef] = Field(default_factory=list) + enrichment_decisions: list[EnrichmentDecisionRecord] = Field(default_factory=list) + card_versions: dict[str, str] = Field(default_factory=dict) + aggregate_context_card_version: str = Field(min_length=1) + context_cards: list[LoadedContextCard] = Field(default_factory=list) + card_hashes: dict[str, str] = Field(default_factory=dict) + materialized_at: datetime diff --git a/src/sema/models/target/normalized.py b/src/sema/models/target/normalized.py new file mode 100644 index 0000000..3b7d207 --- /dev/null +++ b/src/sema/models/target/normalized.py @@ -0,0 +1,25 @@ +"""NormalizedTargetModel — sorted, cross-resolved adapter output.""" + +from __future__ import annotations + +from pydantic import BaseModel, ConfigDict, Field + +from sema.models.target.context_card import TargetContextCard +from sema.models.target.descriptor import TargetModelDescriptor +from sema.models.target.entity import TargetEntityDecl +from sema.models.target.obligation import TargetObligationDecl +from sema.models.target.refs 
class TargetPropertyDecl(BaseModel):
    # Declaration of one property on a target entity. A property is either a
    # plain COLUMN or an ENDPOINT; the endpoint_* fields are mandatory for the
    # latter (see _validate_kind_consistency). Immutable, unknown fields rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    name: str = Field(min_length=1)
    type: str = Field(min_length=1)
    nullable: bool
    synonyms: list[str] = Field(default_factory=list)
    decoded_values: dict[str, str] = Field(default_factory=dict)
    vocabulary_binding: str | None = None
    property_kind: PropertyKind = PropertyKind.COLUMN
    endpoint_role: Literal["subject", "object"] | None = None
    endpoint_target_entity_qualified_name: str | None = None
    endpoint_cardinality: Literal["one", "many"] | None = None
    endpoint_nullable: bool | None = None
    materialized_as_edge_property: bool = True

    @model_validator(mode="after")
    def _validate_kind_consistency(self) -> Self:
        """Check the property name and endpoint fields against ``property_kind``.

        Raises:
            ValueError: a non-ENDPOINT property uses a reserved endpoint name,
                or an ENDPOINT property leaves any endpoint_* field unset.
        """
        if self.property_kind is not PropertyKind.ENDPOINT:
            # Only synthesized endpoint properties may be called subject/object.
            if self.name in _RESERVED_ENDPOINT_NAMES:
                raise ValueError(
                    f"property name {self.name!r} is reserved for synthesized endpoint "
                    f"properties; columnar adapters MUST NOT declare it"
                )
            return self

        # Insertion order fixes the order in which missing fields are reported.
        required_endpoint_fields = {
            "endpoint_role": self.endpoint_role,
            "endpoint_target_entity_qualified_name": self.endpoint_target_entity_qualified_name,
            "endpoint_cardinality": self.endpoint_cardinality,
            "endpoint_nullable": self.endpoint_nullable,
        }
        missing = [
            field_name
            for field_name, value in required_endpoint_fields.items()
            if value is None
        ]
        if missing:
            raise ValueError(
                f"ENDPOINT property requires endpoint fields: missing {missing}"
            )
        return self
TargetPropertyRef(BaseModel): + model_config = ConfigDict(extra="forbid", frozen=True) + + entity_ref: TargetEntityRef + property_name: str = Field(min_length=1) + + +class VocabularySource(str, Enum): + INLINE = "INLINE" + EXTERNAL = "EXTERNAL" + + +class VocabularyRef(BaseModel): + model_config = ConfigDict(extra="forbid", frozen=True) + + name: str = Field(min_length=1) + source: VocabularySource diff --git a/src/sema/models/target/term.py b/src/sema/models/target/term.py new file mode 100644 index 0000000..bb35fb7 --- /dev/null +++ b/src/sema/models/target/term.py @@ -0,0 +1,16 @@ +"""TargetTermDecl — adapter-supplied controlled-vocabulary term.""" + +from __future__ import annotations + +from pydantic import BaseModel, ConfigDict, Field + +from sema.models.target.refs import VocabularyRef + + +class TargetTermDecl(BaseModel): + model_config = ConfigDict(extra="forbid", frozen=True) + + vocabulary: VocabularyRef + code: str = Field(min_length=1) + display: str = Field(min_length=1) + domain: str | None = None diff --git a/src/sema/models/target/vocab_binding.py b/src/sema/models/target/vocab_binding.py new file mode 100644 index 0000000..85d86ca --- /dev/null +++ b/src/sema/models/target/vocab_binding.py @@ -0,0 +1,20 @@ +"""VocabularyBindingDecl — adapter-declared property→vocabulary binding.""" + +from __future__ import annotations + +from pydantic import BaseModel, ConfigDict, Field + +from sema.models.target.refs import TargetEntityRef, VocabularyRef + + +class VocabularyBindingDecl(BaseModel): + model_config = ConfigDict(extra="forbid", frozen=True) + + entity_ref: TargetEntityRef + property_name: str = Field(min_length=1) + vocabulary: VocabularyRef + domain: str | None = None + require_standard: bool = False + allow_zero_default: bool = False + effective_date_ref: str | None = None + resolver_policy_ref: str | None = None diff --git a/src/sema/targets/__init__.py b/src/sema/targets/__init__.py new file mode 100644 index 0000000..de887ae --- /dev/null +++ 
b/src/sema/targets/__init__.py @@ -0,0 +1,52 @@ +"""Target ontology adapters. + +Declarative protocol for loading target-side ontology declarations into +the planner-contract graph. Mirrors `src/sema/connectors/` structurally +but emits DTOs instead of observed metadata. +""" + +from __future__ import annotations + +from sema.targets.base import ( + REQUIRED_METHODS, + TargetOntologyAdapter, + TargetOntologyAdapterMixin, +) +from sema.targets.exceptions import ( + AdapterContractError, + AdapterRegistryError, + AmbiguousAdapterError, + CardContentDriftError, + DanglingRefError, + EnrichmentStatusDivergenceError, + LoaderStageOrderError, + NoMatchingAdapterError, + OverlappingVersionRangeError, + UnknownAdapterError, +) +from sema.targets.registry import ( + discover_entry_points, + get, + list_registered, + register_target_adapter, +) + +__all__ = [ + "TargetOntologyAdapter", + "TargetOntologyAdapterMixin", + "REQUIRED_METHODS", + "register_target_adapter", + "get", + "list_registered", + "discover_entry_points", + "AdapterContractError", + "AdapterRegistryError", + "AmbiguousAdapterError", + "CardContentDriftError", + "DanglingRefError", + "EnrichmentStatusDivergenceError", + "LoaderStageOrderError", + "NoMatchingAdapterError", + "OverlappingVersionRangeError", + "UnknownAdapterError", +] diff --git a/src/sema/targets/adapters/__init__.py b/src/sema/targets/adapters/__init__.py new file mode 100644 index 0000000..44fa574 --- /dev/null +++ b/src/sema/targets/adapters/__init__.py @@ -0,0 +1,3 @@ +"""Concrete target ontology adapter implementations.""" + +from __future__ import annotations diff --git a/src/sema/targets/adapters/manifest.py b/src/sema/targets/adapters/manifest.py new file mode 100644 index 0000000..29158a6 --- /dev/null +++ b/src/sema/targets/adapters/manifest.py @@ -0,0 +1,157 @@ +"""ManifestTargetAdapter — first concrete adapter, target-model-agnostic.""" + +from __future__ import annotations + +from collections.abc import Iterable, Iterator +from 
class ManifestTargetAdapter:
    """Loads target ontology declarations from a YAML/JSON manifest file.

    The manifest is parsed once in ``__init__``; every other method converts
    the cached ``ParsedManifest`` into target DTOs via the ``manifest_utils``
    helpers.
    """

    def __init__(self, manifest_path: Path) -> None:
        """Parse *manifest_path* and build the per-manifest lookup caches.

        Any exception raised by ``parse_manifest_file`` propagates unchanged.
        """
        self._manifest_path = Path(manifest_path)
        self._parsed: ParsedManifest = parse_manifest_file(self._manifest_path)
        self._target_model_id = self._parsed.descriptor.target_model_id
        self._entities_by_qname: dict[str, ManifestEntity] = {
            e.qualified_name: e for e in self._parsed.entities
        }
        # Computed once here rather than on every load_entity() call: the set
        # depends only on the parsed manifest, so recomputing it per entity
        # made a full load O(entities x terms).
        self._inline_vocab_names: set[str] = inline_term_vocab_names(self._parsed)

    @property
    def parsed_manifest(self) -> ParsedManifest:
        """The ``ParsedManifest`` produced when the adapter was constructed."""
        return self._parsed

    def describe(self) -> TargetModelDescriptor:
        """Build the model descriptor from the manifest's descriptor section."""
        completeness = self._descriptor_completeness()
        return TargetModelDescriptor(
            target_model_id=self._parsed.descriptor.target_model_id,
            target_model_version=self._parsed.descriptor.target_model_version,
            display_name=self._parsed.descriptor.display_name,
            owner=self._parsed.descriptor.owner,
            vocabulary_release=self._parsed.descriptor.vocabulary_release,
            completeness=completeness,
        )

    def discover_entities(self) -> Iterable[TargetEntityRef]:
        """Return a ref for every manifest entity, sorted by qualified name."""
        refs = [
            to_target_entity_ref(e, self._target_model_id)
            for e in self._parsed.entities
        ]
        return sorted(refs, key=lambda r: r.qualified_name)

    def load_entity(self, ref: TargetEntityRef) -> TargetEntityDecl:
        """Convert the manifest entity named by *ref* into a ``TargetEntityDecl``.

        Raises:
            KeyError: the manifest does not declare ``ref.qualified_name``.
        """
        manifest_entity = self._lookup_entity(ref)
        return to_target_entity(
            self._parsed,
            manifest_entity,
            self._target_model_id,
            self._descriptor_completeness(),
            self._inline_vocab_names,
        )

    def load_obligation(self, ref: TargetEntityRef) -> TargetObligationDecl:
        """Return the obligation for *ref*.

        Raises:
            KeyError: the manifest does not declare ``ref.qualified_name``.
        """
        manifest_entity = self._lookup_entity(ref)
        return to_obligation(manifest_entity)

    def load_vocabulary_bindings(
        self, ref: TargetPropertyRef
    ) -> Iterable[VocabularyBindingDecl]:
        """Return the vocabulary bindings declared on the referenced property.

        Unlike the entity-level loaders this does not raise for unknown refs:
        an undeclared entity or property yields an empty list.
        """
        manifest_entity = self._entities_by_qname.get(ref.entity_ref.qualified_name)
        if manifest_entity is None:
            return []
        for prop in manifest_entity.properties:
            if prop.name == ref.property_name:
                return to_vocabulary_bindings(
                    self._parsed, manifest_entity, prop, self._target_model_id
                )
        return []

    def load_context_card(self, ref: TargetEntityRef) -> TargetContextCard:
        """Return the manifest-supplied card for *ref*, or a synthesized one.

        Raises:
            KeyError: the manifest does not declare ``ref.qualified_name``.
        """
        manifest_entity = self._lookup_entity(ref)
        if manifest_entity.context_card is not None:
            return supplied_card(manifest_entity, self._target_model_id)
        return synthesized_card(manifest_entity, self._target_model_id)

    def iter_terms(self, vocabulary_ref: VocabularyRef) -> Iterator[TargetTermDecl]:
        """Iterate the terms the manifest inlines for *vocabulary_ref*.

        Raises:
            NotImplementedError: the manifest carries no inline term for that
                vocabulary name. Raised eagerly, at call time.
        """
        if not filter_inline_term_vocabularies(self._parsed, vocabulary_ref.name):
            raise NotImplementedError(
                f"manifest adapter does not inline terms for vocabulary "
                f"{vocabulary_ref.name!r}; treated as EXTERNAL"
            )
        return iter(to_inline_terms(self._parsed, vocabulary_ref.name))

    def _descriptor_completeness(self) -> SemanticCompletenessAnnotations:
        """Descriptor-level completeness, as resolved by ``descriptor_completeness``."""
        return descriptor_completeness(self._parsed)

    def _lookup_entity(self, ref: TargetEntityRef) -> ManifestEntity:
        """Resolve *ref* to its manifest entity by qualified name.

        Raises:
            KeyError: no entity with that qualified name was declared; the
                message lists the declared names.
        """
        manifest_entity = self._entities_by_qname.get(ref.qualified_name)
        if manifest_entity is None:
            raise KeyError(
                f"manifest does not declare entity {ref.qualified_name!r}; "
                f"declared={sorted(self._entities_by_qname)}"
            )
        return manifest_entity
UnsupportedManifestExtensionError(ManifestError): + pass + + +class ManifestSchemaError(ManifestError): + pass + + +class ManifestEndpointError(ManifestError): + pass + + +class ManifestContextCardError(ManifestError): + pass + + +class ManifestReservedNameError(ManifestError): + pass + + +class ManifestVocabularyError(ManifestError): + pass diff --git a/src/sema/targets/adapters/manifest_models.py b/src/sema/targets/adapters/manifest_models.py new file mode 100644 index 0000000..3a8f6ff --- /dev/null +++ b/src/sema/targets/adapters/manifest_models.py @@ -0,0 +1,161 @@ +"""Pydantic models mirroring the manifest schema (manifest_version=1).""" + +from __future__ import annotations + +from typing import Any, Literal + +from pydantic import BaseModel, ConfigDict, Field + +from sema.models.planner._enums import ( + PrimaryKeyStrategy, + TargetArtifactKind, +) +from sema.models.target.completeness import SemanticCompletenessAnnotations +from sema.models.target.refs import VocabularySource + + +class ManifestVocabulary(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: str = Field(min_length=1) + source: VocabularySource + + +class ManifestTerm(BaseModel): + model_config = ConfigDict(extra="forbid") + + vocabulary: str = Field(min_length=1) + code: str = Field(min_length=1) + display: str = Field(min_length=1) + domain: str | None = None + + +class ManifestVocabularyBinding(BaseModel): + model_config = ConfigDict(extra="forbid") + + vocabulary: str = Field(min_length=1) + domain: str | None = None + require_standard: bool = False + allow_zero_default: bool = False + effective_date_ref: str | None = None + resolver_policy_ref: str | None = None + + +class ManifestProperty(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: str = Field(min_length=1) + type: str = Field(min_length=1) + nullable: bool = False + synonyms: list[str] = Field(default_factory=list) + decoded_values: dict[str, str] = Field(default_factory=dict) + vocabulary_binding: 
ManifestVocabularyBinding | None = None + + +class ManifestEndpoint(BaseModel): + model_config = ConfigDict(extra="forbid") + + target_entity: str = Field(min_length=1) + cardinality: Literal["one", "many"] = "one" + nullable: bool = False + + +class ManifestEndpoints(BaseModel): + model_config = ConfigDict(extra="forbid") + + subject: ManifestEndpoint + object: ManifestEndpoint + + +class ManifestForeignKey(BaseModel): + model_config = ConfigDict(extra="forbid") + + referenced_entity: str = Field(min_length=1) + join_keys: list[tuple[str, str]] = Field(min_length=1) + same_build_required: bool = True + + +class ManifestDomainConstraint(BaseModel): + model_config = ConfigDict(extra="forbid") + + property_name: str = Field(min_length=1) + domain_id: str = Field(min_length=1) + + +class ManifestRowClause(BaseModel): + model_config = ConfigDict(extra="forbid") + + kind: Literal["presence", "equality"] + field: str = Field(min_length=1) + value: Any | None = None + + +class ManifestRowPredicate(BaseModel): + model_config = ConfigDict(extra="forbid") + + op: Literal["AND", "OR"] + clauses: list[ManifestRowClause] = Field(min_length=1) + + +class ManifestExternalSequence(BaseModel): + model_config = ConfigDict(extra="forbid") + + mapping_table_name: str = Field(min_length=1) + canonical_identity_column: str = Field(min_length=1) + sequence_column: str = Field(min_length=1) + + +class ManifestObligation(BaseModel): + model_config = ConfigDict(extra="forbid") + + required_fields: list[str] = Field(min_length=1) + nullable_fields: list[str] = Field(default_factory=list) + primary_key: PrimaryKeyStrategy + external_sequence: ManifestExternalSequence | None = None + foreign_keys: list[ManifestForeignKey] = Field(default_factory=list) + domain_constraints: list[ManifestDomainConstraint] = Field(default_factory=list) + allowed_defaults: dict[str, Any] = Field(default_factory=dict) + minimum_viable_row: ManifestRowPredicate | None = None + + +class 
ManifestContextCard(BaseModel): + model_config = ConfigDict(extra="forbid") + + card_version: str = Field(min_length=1) + description: str = Field(min_length=1, max_length=4000) + examples: list[str] = Field(default_factory=list) + obligation_summary: str | None = None + curated_synonyms: list[str] = Field(default_factory=list) + + +class ManifestEntity(BaseModel): + model_config = ConfigDict(extra="forbid") + + qualified_name: str = Field(min_length=1) + kind: TargetArtifactKind + completeness: SemanticCompletenessAnnotations | None = None + endpoints: ManifestEndpoints | None = None + properties: list[ManifestProperty] = Field(default_factory=list) + obligation: ManifestObligation | None = None + context_card: ManifestContextCard | None = None + + +class ManifestDescriptor(BaseModel): + model_config = ConfigDict(extra="forbid") + + target_model_id: str = Field(min_length=1) + target_model_version: str = Field(min_length=1) + display_name: str = Field(min_length=1) + owner: str | None = None + vocabulary_release: str | None = None + completeness: SemanticCompletenessAnnotations | None = None + + +class ParsedManifest(BaseModel): + model_config = ConfigDict(extra="forbid") + + manifest_version: int + descriptor: ManifestDescriptor + vocabularies: list[ManifestVocabulary] = Field(default_factory=list) + terms: list[ManifestTerm] = Field(default_factory=list) + entities: list[ManifestEntity] = Field(default_factory=list) diff --git a/src/sema/targets/adapters/manifest_parser.py b/src/sema/targets/adapters/manifest_parser.py new file mode 100644 index 0000000..be4a47f --- /dev/null +++ b/src/sema/targets/adapters/manifest_parser.py @@ -0,0 +1,149 @@ +"""Manifest file → ParsedManifest with file-format and structural validation.""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import yaml +from pydantic import ValidationError + +from sema.models.planner._enums import TargetArtifactKind +from 
sema.targets.adapters.manifest_exceptions import ( + ManifestContextCardError, + ManifestEndpointError, + ManifestReservedNameError, + ManifestSchemaError, + UnsupportedManifestExtensionError, + UnsupportedManifestVersionError, +) +from sema.targets.adapters.manifest_models import ( + ManifestEntity, + ParsedManifest, +) + +SUPPORTED_MANIFEST_VERSIONS: frozenset[int] = frozenset({1}) +RESERVED_ENDPOINT_PROPERTY_NAMES: frozenset[str] = frozenset({"subject", "object"}) + + +def parse_manifest_file(path: Path) -> ParsedManifest: + raw = _load_raw(path) + return parse_manifest_raw(raw) + + +def parse_manifest_raw(raw: dict[str, Any]) -> ParsedManifest: + _reject_card_hash_anywhere(raw) + version = raw.get("manifest_version") + if version not in SUPPORTED_MANIFEST_VERSIONS: + raise UnsupportedManifestVersionError( + f"manifest_version={version!r} is not supported; " + f"supported={sorted(SUPPORTED_MANIFEST_VERSIONS)}" + ) + try: + parsed = ParsedManifest.model_validate(raw) + except ValidationError as exc: + raise ManifestSchemaError(str(exc)) from exc + _validate_entities(parsed) + _validate_endpoint_kinds(parsed) + return parsed + + +def _load_raw(path: Path) -> dict[str, Any]: + suffix = path.suffix.lower() + text = path.read_text() + if suffix in (".yaml", ".yml"): + loaded = yaml.safe_load(text) + elif suffix == ".json": + loaded = json.loads(text) + else: + raise UnsupportedManifestExtensionError( + f"manifest file extension {suffix!r} is not supported; " + f"use .yaml, .yml, or .json" + ) + if not isinstance(loaded, dict): + raise ManifestSchemaError( + f"manifest root must be a mapping; got {type(loaded).__name__}" + ) + return loaded + + +def _reject_card_hash_anywhere(raw: Any, path: str = "$") -> None: + if isinstance(raw, dict): + if "card_hash" in raw: + raise ManifestContextCardError( + f"manifest at {path} carries forbidden field 'card_hash'; " + f"card_hash is computed by Sema" + ) + for key, value in raw.items(): + _reject_card_hash_anywhere(value, 
f"{path}.{key}") + elif isinstance(raw, list): + for i, item in enumerate(raw): + _reject_card_hash_anywhere(item, f"{path}[{i}]") + + +def _validate_entities(parsed: ParsedManifest) -> None: + for entity in parsed.entities: + _validate_reserved_names(entity) + _validate_endpoints_kind_invariant(entity) + _validate_context_card(entity) + + +def _validate_reserved_names(entity: ManifestEntity) -> None: + for prop in entity.properties: + if prop.name in RESERVED_ENDPOINT_PROPERTY_NAMES: + raise ManifestReservedNameError( + f"entity {entity.qualified_name!r} declares property {prop.name!r}; " + f"'subject' and 'object' are reserved for synthesized endpoint properties" + ) + + +def _validate_endpoints_kind_invariant(entity: ManifestEntity) -> None: + if entity.kind is TargetArtifactKind.GRAPH_EDGE: + if entity.endpoints is None: + raise ManifestEndpointError( + f"entity {entity.qualified_name!r} kind=GRAPH_EDGE requires endpoints" + ) + else: + if entity.endpoints is not None: + raise ManifestEndpointError( + f"entity {entity.qualified_name!r} kind={entity.kind.value} " + f"forbids endpoints" + ) + + +def _validate_context_card(entity: ManifestEntity) -> None: + if entity.context_card is None: + return + if not entity.context_card.description.strip(): + raise ManifestContextCardError( + f"entity {entity.qualified_name!r} context_card has empty description" + ) + + +def _validate_endpoint_kinds(parsed: ParsedManifest) -> None: + by_qname = {e.qualified_name: e for e in parsed.entities} + for entity in parsed.entities: + if entity.endpoints is None: + continue + for role, endpoint in ( + ("subject", entity.endpoints.subject), + ("object", entity.endpoints.object), + ): + target = by_qname.get(endpoint.target_entity) + if target is None: + continue + if target.kind is TargetArtifactKind.TABLE_ROW: + raise ManifestEndpointError( + f"entity {entity.qualified_name!r} endpoint {role!r} targets " + f"{endpoint.target_entity!r} of kind=TABLE_ROW; endpoints MUST " + f"reference 
GRAPH_NODE or GRAPH_EDGE entities" + ) + + +__all__ = [ + "parse_manifest_file", + "parse_manifest_raw", + "SUPPORTED_MANIFEST_VERSIONS", + "RESERVED_ENDPOINT_PROPERTY_NAMES", +] diff --git a/src/sema/targets/adapters/manifest_utils.py b/src/sema/targets/adapters/manifest_utils.py new file mode 100644 index 0000000..a4dc95f --- /dev/null +++ b/src/sema/targets/adapters/manifest_utils.py @@ -0,0 +1,366 @@ +"""Manifest → target DTO conversion helpers.""" + +from __future__ import annotations + +from sema.models.planner._enums import ( + PrimaryKeyStrategy, + TargetArtifactKind, +) +from sema.models.planner.target_model import ( + DomainConstraint, + ExternalSequenceMappingTable, + FieldEquality, + FieldPresence, + ForeignKeyObligation, + RowPredicate, +) +from sema.models.target.completeness import ( + SemanticCompleteness, + SemanticCompletenessAnnotations, +) +from sema.models.target.context_card import TargetContextCard +from sema.models.target.endpoints import EdgeEndpointDecl, EdgeEndpointsDecl +from sema.models.target.entity import TargetEntityDecl +from sema.models.target.obligation import TargetObligationDecl +from sema.models.target.properties import TargetPropertyDecl +from sema.models.target.refs import TargetEntityRef, VocabularyRef, VocabularySource +from sema.models.target.term import TargetTermDecl +from sema.models.target.vocab_binding import VocabularyBindingDecl +from sema.targets.adapters.manifest_models import ( + ManifestEndpoint, + ManifestEntity, + ManifestObligation, + ManifestProperty, + ManifestRowClause, + ManifestRowPredicate, + ManifestTerm, + ManifestVocabulary, + ManifestVocabularyBinding, + ParsedManifest, +) + + +_DESCRIPTOR_DEFAULT_COMPLETENESS = SemanticCompletenessAnnotations( + structure=SemanticCompleteness.COMPLETE, + obligations=SemanticCompleteness.COMPLETE, + vocabulary_bindings=SemanticCompleteness.PARTIAL, + semantic_aliases=SemanticCompleteness.PARTIAL, + terms=SemanticCompleteness.EXTERNAL, +) + + +def 
def has_any_inline_term_for_entity_bindings(
    entity: ManifestEntity, inline_vocab_names: set[str]
) -> bool:
    """Return True if any property of *entity* binds to a vocabulary in *inline_vocab_names*.

    Properties without a vocabulary binding are ignored.
    """
    bindings = (prop.vocabulary_binding for prop in entity.properties)
    return any(
        binding is not None and binding.vocabulary in inline_vocab_names
        for binding in bindings
    )
name=prop.name, + type=prop.type, + nullable=prop.nullable, + synonyms=list(prop.synonyms), + decoded_values=dict(prop.decoded_values), + vocabulary_binding=( + prop.vocabulary_binding.vocabulary + if prop.vocabulary_binding is not None + else None + ), + ) + + +def to_endpoints_decl( + parsed: ParsedManifest, + entity: ManifestEntity, + target_model_id: str, +) -> EdgeEndpointsDecl | None: + if entity.endpoints is None: + return None + return EdgeEndpointsDecl( + subject=_to_endpoint_decl("subject", entity.endpoints.subject, parsed, target_model_id), + object=_to_endpoint_decl("object", entity.endpoints.object, parsed, target_model_id), + ) + + +def _to_endpoint_decl( + role: str, + endpoint: ManifestEndpoint, + parsed: ParsedManifest, + target_model_id: str, +) -> EdgeEndpointDecl: + target_kind = _resolve_endpoint_target_kind(endpoint.target_entity, parsed) + target_ref = TargetEntityRef( + target_model_id=target_model_id, + qualified_name=endpoint.target_entity, + kind=target_kind, + ) + return EdgeEndpointDecl( + role=role, # type: ignore[arg-type] + target_entity=target_ref, + cardinality=endpoint.cardinality, + nullable=endpoint.nullable, + ) + + +def _resolve_endpoint_target_kind( + qualified_name: str, parsed: ParsedManifest +) -> TargetArtifactKind: + for entity in parsed.entities: + if entity.qualified_name == qualified_name: + return entity.kind + return TargetArtifactKind.GRAPH_NODE + + +def to_target_entity( + parsed: ParsedManifest, + entity: ManifestEntity, + target_model_id: str, + descriptor_default: SemanticCompletenessAnnotations, + inline_vocab_names: set[str], +) -> TargetEntityDecl: + completeness = effective_entity_completeness( + entity, descriptor_default, inline_vocab_names + ) + return TargetEntityDecl( + ref=to_target_entity_ref(entity, target_model_id), + properties=[to_target_property(p) for p in entity.properties], + completeness=completeness, + endpoints=to_endpoints_decl(parsed, entity, target_model_id), + ) + + +def to_obligation( 
def _default_obligation_for_kind(entity: ManifestEntity) -> TargetObligationDecl:
    """Build the fallback obligation for an entity that declares none.

    GRAPH_EDGE entities require ``subject`` and ``object`` and take the
    default minimum-viable-row predicate. Every other kind requires its first
    declared property, or the literal ``"__placeholder__"`` when it has no
    properties. Both variants use a natural-key primary key.
    """
    if entity.kind is not TargetArtifactKind.GRAPH_EDGE:
        if entity.properties:
            required = [entity.properties[0].name]
        else:
            required = ["__placeholder__"]
        # minimum_viable_row is deliberately left to the model's own default.
        return TargetObligationDecl(
            target_entity=entity.qualified_name,
            required_fields=required,
            primary_key=PrimaryKeyStrategy.NATURAL_KEY,
        )
    return TargetObligationDecl(
        target_entity=entity.qualified_name,
        required_fields=["subject", "object"],
        primary_key=PrimaryKeyStrategy.NATURAL_KEY,
        minimum_viable_row=_default_minimum(entity),
    )
def to_vocabulary_bindings(
    parsed: ParsedManifest,
    entity: ManifestEntity,
    prop: ManifestProperty,
    target_model_id: str,
) -> list[VocabularyBindingDecl]:
    """Convert the binding declared on *prop* into a list of binding DTOs.

    Returns an empty list when the property declares no binding, otherwise a
    single-element list. The vocabulary's source comes from the manifest's
    ``vocabularies`` section and defaults to ``VocabularySource.INLINE`` when
    the vocabulary is not listed there.
    """
    declared = prop.vocabulary_binding
    if declared is None:
        return []
    declared_source = vocab_source_index(parsed).get(
        declared.vocabulary, VocabularySource.INLINE
    )
    owner_ref = to_target_entity_ref(entity, target_model_id)
    vocabulary_ref = VocabularyRef(name=declared.vocabulary, source=declared_source)
    binding_decl = VocabularyBindingDecl(
        entity_ref=owner_ref,
        property_name=prop.name,
        vocabulary=vocabulary_ref,
        domain=declared.domain,
        require_standard=declared.require_standard,
        allow_zero_default=declared.allow_zero_default,
        effective_date_ref=declared.effective_date_ref,
        resolver_policy_ref=declared.resolver_policy_ref,
    )
    return [binding_decl]
{obligation_summary}.", + examples=[], + obligation_summary=obligation_summary, + curated_synonyms=[], + ) + + +def supplied_card( + entity: ManifestEntity, target_model_id: str +) -> TargetContextCard: + raw = entity.context_card + assert raw is not None + return TargetContextCard( + entity_ref=to_target_entity_ref(entity, target_model_id), + card_version=raw.card_version, + description=raw.description, + examples=list(raw.examples), + obligation_summary=raw.obligation_summary, + curated_synonyms=list(raw.curated_synonyms), + ) + + +def filter_inline_term_vocabularies( + parsed: ParsedManifest, vocab_name: str +) -> list[ManifestTerm]: + return [t for t in parsed.terms if t.vocabulary == vocab_name] + + +def filter_vocabularies(parsed: ParsedManifest, name: str) -> ManifestVocabulary | None: + for vocab in parsed.vocabularies: + if vocab.name == name: + return vocab + return None + + +__all__ = [ + "descriptor_completeness", + "vocab_source_index", + "inline_term_vocab_names", + "to_target_entity_ref", + "to_target_property", + "to_endpoints_decl", + "to_target_entity", + "to_obligation", + "to_vocabulary_bindings", + "to_inline_terms", + "synthesized_card", + "supplied_card", + "filter_inline_term_vocabularies", + "filter_vocabularies", + "ManifestVocabularyBinding", +] diff --git a/src/sema/targets/base.py b/src/sema/targets/base.py new file mode 100644 index 0000000..1f75385 --- /dev/null +++ b/src/sema/targets/base.py @@ -0,0 +1,58 @@ +"""TargetOntologyAdapter protocol surface.""" + +from __future__ import annotations + +from collections.abc import Iterable, Iterator +from typing import Protocol, runtime_checkable + +from sema.models.target.context_card import TargetContextCard +from sema.models.target.descriptor import TargetModelDescriptor +from sema.models.target.entity import TargetEntityDecl +from sema.models.target.obligation import TargetObligationDecl +from sema.models.target.refs import TargetEntityRef, TargetPropertyRef, VocabularyRef +from 
# --- src/sema/targets/base.py ---


@runtime_checkable
class TargetOntologyAdapter(Protocol):
    """Declarative loader for a target ontology.

    Adapters emit DTOs; they MUST NOT call the matching engine, planner,
    or constraint layer, MUST NOT call any LLM, and MUST NOT mutate the
    graph. Snapshot hashing is owned by `SnapshotHasher`; adapters MUST
    NOT compute or return snapshot hashes.
    """

    def describe(self) -> TargetModelDescriptor:
        ...

    def discover_entities(self) -> Iterable[TargetEntityRef]:
        ...

    def load_entity(self, ref: TargetEntityRef) -> TargetEntityDecl:
        ...

    def load_obligation(self, ref: TargetEntityRef) -> TargetObligationDecl:
        ...

    def load_vocabulary_bindings(
        self, ref: TargetPropertyRef
    ) -> Iterable[VocabularyBindingDecl]:
        ...

    def load_context_card(self, ref: TargetEntityRef) -> TargetContextCard:
        ...

    def iter_terms(self, vocabulary_ref: VocabularyRef) -> Iterator[TargetTermDecl]:
        ...


class TargetOntologyAdapterMixin:
    """Default-implementation mixin for adapters that do not inline terms."""

    def iter_terms(self, vocabulary_ref: VocabularyRef) -> Iterator[TargetTermDecl]:
        # Always raises NotImplementedError for adapters that do not inline terms.
        raise NotImplementedError("EXTERNAL terms; adapter does not inline")


# Method names listed as required; `iter_terms` is not among them.
REQUIRED_METHODS: tuple[str, ...] = (
    "describe",
    "discover_entities",
    "load_entity",
    "load_obligation",
    "load_vocabulary_bindings",
    "load_context_card",
)


# --- src/sema/targets/exceptions.py ---


class AdapterRegistryError(Exception):
    """Base error for the target adapter registry."""


class UnknownAdapterError(AdapterRegistryError):
    """Raised when registry.get cannot resolve adapter_id or target_model_id."""


class AmbiguousAdapterError(AdapterRegistryError):
    """Raised when registry.get matches more than one registration."""


class NoMatchingAdapterError(AdapterRegistryError):
    """Raised when registry.get finds no adapter for a requested version."""


class OverlappingVersionRangeError(AdapterRegistryError):
    """Raised when a registration overlaps an existing supported_versions range."""


class AdapterContractError(Exception):
    """Raised when an adapter violates its declarative contract."""


class DanglingRefError(AdapterContractError):
    """Raised by the normalizer when a DTO reference cannot be resolved."""


class LoaderStageOrderError(Exception):
    """Raised when the loader pipeline stages run out of order."""


class CardContentDriftError(Exception):
    """Raised when a card's content hash drifts under an unchanged card_version."""


class EnrichmentStatusDivergenceError(Exception):
    """Raised when compact and structured enrichment statuses disagree."""
# --- src/sema/targets/hashing.py ---

# Descriptor fields removed before hashing.
_DESCRIPTOR_NON_SCHEMA_FIELDS: tuple[str, ...] = ("display_name", "owner")


def _project_descriptor(descriptor: BaseModel) -> dict[str, object]:
    """Dump *descriptor* to JSON-mode primitives minus the non-schema fields."""
    raw = descriptor.model_dump(mode="json")
    for field in _DESCRIPTOR_NON_SCHEMA_FIELDS:
        raw.pop(field, None)
    return raw


def _project_schema_bearing(model: NormalizedTargetModel) -> dict[str, object]:
    """Return the parts of *model* that feed the snapshot hash.

    Context cards are not part of the projection.
    """
    return {
        "descriptor": _project_descriptor(model.descriptor),
        "entities": [e.model_dump(mode="json") for e in model.entities],
        "obligations": [o.model_dump(mode="json") for o in model.obligations],
        "vocabularies": [v.model_dump(mode="json") for v in model.vocabularies],
        "vocabulary_bindings": [
            b.model_dump(mode="json") for b in model.vocabulary_bindings
        ],
        "terms": [t.model_dump(mode="json") for t in model.terms],
    }


def _sha256_hex(text: str) -> str:
    """Return the hex SHA-256 digest of *text* encoded as UTF-8."""
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def _scan_for_snapshot_hash_field(value: object, path: str = "$") -> None:
    """Recursively reject any dict carrying a ``snapshot_hash`` key.

    Raises:
        AdapterContractError: naming the JSON-path-like location of the
            offending dict.
    """
    if isinstance(value, dict):
        if "snapshot_hash" in value:
            raise AdapterContractError(
                f"DTO at {path} carries forbidden field 'snapshot_hash'; "
                f"snapshot hashing is owned by SnapshotHasher"
            )
        for key, item in value.items():
            _scan_for_snapshot_hash_field(item, f"{path}.{key}")
        return
    if isinstance(value, list):
        for i, item in enumerate(value):
            _scan_for_snapshot_hash_field(item, f"{path}[{i}]")


class SnapshotHasher:
    """Deterministic SHA-256 over the schema-bearing projection of a model."""

    @staticmethod
    def hash(model: NormalizedTargetModel) -> str:
        """Return the hex snapshot hash of *model*.

        Raises:
            AdapterContractError: if any projected DTO carries a
                ``snapshot_hash`` field.
        """
        projection = _project_schema_bearing(model)
        _scan_for_snapshot_hash_field(projection)
        return _sha256_hex(canonical_dumps(projection))


# Card fields that make up its content hash; `card_version` and the entity
# reference are not hashed.
_CARD_CONTENT_FIELDS: tuple[str, ...] = (
    "description",
    "examples",
    "obligation_summary",
    "curated_synonyms",
)


def compute_card_hash(card: TargetContextCard) -> str:
    """Return the hex SHA-256 of the card's content fields in canonical JSON."""
    payload = {field: getattr(card, field) for field in _CARD_CONTENT_FIELDS}
    return _sha256_hex(canonical_dumps(payload))


# --- src/sema/targets/hashing_utils.py ---


def normalize_for_canonical_json(value: Any) -> Any:
    """Recursively convert *value* into canonical JSON-ready primitives.

    Datetimes become UTC ISO-8601 strings with a ``Z`` suffix, dates become
    ISO strings, dict keys are stringified, tuples become lists and floats
    are wrapped in ``_CanonicalFloat``. Everything else passes through.
    """
    # `datetime` must be tested before `date`: datetime is a date subclass.
    if isinstance(value, datetime):
        return _iso_with_z(value)
    if isinstance(value, date):
        return value.isoformat()
    if isinstance(value, dict):
        return {str(k): normalize_for_canonical_json(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [normalize_for_canonical_json(item) for item in value]
    if isinstance(value, float):
        return _CanonicalFloat(value)
    return value


def _iso_with_z(value: datetime) -> str:
    """Render *value* as ISO 8601 in UTC with a ``Z`` suffix.

    Naive datetimes get ``Z`` appended to their ISO form unchanged. Aware
    datetimes are converted to UTC first, so the same instant expressed with
    different offsets yields identical canonical text and identical hashes.
    """
    # Local import: this module's top-level import brings in only date/datetime.
    from datetime import timezone

    if value.tzinfo is None or value.utcoffset() is None:
        return value.isoformat() + "Z"
    as_utc = value.astimezone(timezone.utc).isoformat()
    # A UTC-aware isoformat() always ends with "+00:00".
    return as_utc[: -len("+00:00")] + "Z"


class _CanonicalFloat(float):
    """Float subclass that renders via `repr()` for canonical-JSON stability."""

    def __repr__(self) -> str:
        return repr(float(self))


def _default(obj: Any) -> Any:  # pragma: no cover - dispatched only for non-stdlib types
    """`json.dumps` fallback: refuse anything not already JSON-serializable."""
    raise TypeError(f"Type {type(obj).__name__} is not JSON-serializable")


def canonical_dumps(value: Any) -> str:
    """Serialize *value* to canonical JSON.

    Keys are sorted, separators carry no whitespace, non-ASCII characters are
    emitted as-is, and NaN/Infinity are rejected with ``ValueError``.
    """
    normalized = normalize_for_canonical_json(value)
    return json.dumps(
        normalized,
        sort_keys=True,
        separators=(",", ":"),
        ensure_ascii=False,
        default=_default,
        allow_nan=False,
    )


# --- src/sema/targets/loader.py ---

# Callback that receives stage-event names such as "normalize_started".
StageSpy = Callable[[str], None]


def _utcnow() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(tz=timezone.utc)


def load_target(
    adapter: TargetOntologyAdapter,
    *,
    writer: GraphWriter,
    selected_refs: Iterable[TargetEntityRef] | None = None,
    skip_facets: Iterable[str] = (),
    persisted_card_hashes: dict[tuple[str, str, str], str] | None = None,
    stage_spy: StageSpy | None = None,
) -> LoadedTarget:
    """Run the normalize -> hash -> materialize pipeline for *adapter*.

    Args:
        adapter: source of the target-model DTOs.
        writer: graph writer that receives the materialized ops.
        selected_refs: optional entity subset, passed to the normalizer.
        skip_facets: facet names to skip. A bare string is treated as a
            single facet name, not as an iterable of characters.
        persisted_card_hashes: previously stored card hashes keyed by
            ``(target_model_id, qualified_name, card_version)``, used for
            drift detection.
        stage_spy: optional callback notified at each stage start and end.

    Returns:
        The `LoadedTarget` summary of what was materialized.
    """
    # frozenset("terms") would yield single characters, silently disabling the skip.
    if isinstance(skip_facets, str):
        skip_facets = (skip_facets,)
    skipped = frozenset(skip_facets)
    spy = stage_spy or (lambda _: None)
    guard = StageGuard()

    spy("normalize_started")
    normalized = TargetModelNormalizer.normalize(adapter, selected_refs)
    guard.transition_to(StageGuard.NORMALIZED)
    spy("normalize_completed")

    spy("hash_started")
    target_schema_snapshot_hash = SnapshotHasher.hash(normalized)
    guard.transition_to(StageGuard.HASHED)
    spy("hash_completed")

    iter_terms_external = _detect_iter_terms_external_vocabs(adapter, normalized)
    decisions = _derive_decisions(normalized, skipped, iter_terms_external)

    cards_with_hash = populate_card_hashes(
        normalized.context_cards, persisted_card_hashes
    )

    spy("materialize_started")
    TargetModelMaterializer.write(
        normalized,
        target_schema_snapshot_hash,
        writer,
        decisions,
        cards_with_hash=cards_with_hash,
        stage_guard=guard,
    )
    spy("materialize_completed")

    return _build_loaded_target(
        normalized.descriptor,
        normalized,
        target_schema_snapshot_hash,
        decisions,
        cards_with_hash,
    )


def _derive_decisions(
    normalized: NormalizedTargetModel,
    skip_facets: frozenset[str],
    iter_terms_external_vocabs: frozenset[str],
) -> list[EnrichmentDecisionRecord]:
    """Derive one enrichment decision record per normalized entity."""
    return [
        derive_enrichment_record(
            entity=entity,
            descriptor=normalized.descriptor,
            bindings=normalized.vocabulary_bindings,
            terms=normalized.terms,
            skip_facets=skip_facets,
            iter_terms_external_vocabs=iter_terms_external_vocabs,
        )
        for entity in normalized.entities
    ]


def _detect_iter_terms_external_vocabs(
    adapter: TargetOntologyAdapter, normalized: NormalizedTargetModel
) -> frozenset[str]:
    """Return names of non-EXTERNAL bound vocabularies the adapter cannot iterate.

    Each vocabulary name is probed at most once.
    """
    vocab_names: set[str] = set()
    seen: set[str] = set()
    for binding in normalized.vocabulary_bindings:
        vocab = binding.vocabulary
        if vocab.source is VocabularySource.EXTERNAL:
            continue
        if vocab.name in seen:
            continue
        seen.add(vocab.name)
        if _adapter_treats_vocab_as_external(adapter, vocab):
            vocab_names.add(vocab.name)
    return frozenset(vocab_names)


def _adapter_treats_vocab_as_external(
    adapter: TargetOntologyAdapter, vocab: VocabularyRef
) -> bool:
    """Return True when the adapter has no usable `iter_terms` for *vocab*.

    That is the case when the method is missing, or when calling it or pulling
    its first item raises NotImplementedError.
    """
    iter_terms = getattr(adapter, "iter_terms", None)
    if iter_terms is None:
        return True
    try:
        # Pull one item so generator-based adapters that raise lazily are caught too.
        next(iter(iter_terms(vocab)), None)
    except NotImplementedError:
        return True
    return False


def _build_loaded_target(
    descriptor: TargetModelDescriptor,
    normalized: NormalizedTargetModel,
    target_schema_snapshot_hash: str,
    decisions: list[EnrichmentDecisionRecord],
    cards_with_hash: list[LoadedContextCard],
) -> LoadedTarget:
    """Assemble the `LoadedTarget` result, stamped with the current UTC time."""
    card_hashes = {c.entity_ref.qualified_name: c.card_hash for c in cards_with_hash}
    return LoadedTarget(
        descriptor=descriptor,
        target_schema_snapshot_hash=target_schema_snapshot_hash,
        entity_refs=[e.ref for e in normalized.entities],
        enrichment_decisions=decisions,
        card_versions=card_versions_dict(cards_with_hash),
        aggregate_context_card_version=aggregate_context_card_version(cards_with_hash),
        context_cards=cards_with_hash,
        card_hashes=card_hashes,
        materialized_at=_utcnow(),
    )
# --- src/sema/targets/loader_utils.py ---


def _utcnow() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(tz=timezone.utc)


def derive_facet_decision(
    facet: Facet,
    annotation: SemanticCompleteness,
    annotation_source: str,
    skip_facets: frozenset[str],
    supplied_by_adapter: bool,
    decided_at: datetime,
) -> FacetDecision:
    """Decide the enrichment status of one facet.

    Precedence, first match wins: adapter-supplied data, a COMPLETE
    annotation, an EXTERNAL annotation, an operator skip, then deferral.
    """
    if supplied_by_adapter:
        status = EnrichmentStatus.supplied_by_adapter
        reason = f"adapter inlined data for facet ({annotation_source})"
    elif annotation is SemanticCompleteness.COMPLETE:
        status = EnrichmentStatus.not_required
        reason = "adapter declared facet COMPLETE"
    elif annotation is SemanticCompleteness.EXTERNAL:
        status = EnrichmentStatus.not_required
        reason = "adapter declared facet EXTERNAL"
    elif facet.value in skip_facets:
        status = EnrichmentStatus.required_skipped
        reason = f"operator opt-out via build-config skip_facets={sorted(skip_facets)}"
    else:
        status = EnrichmentStatus.required_deferred
        reason = f"facet declared {annotation.value}; awaiting target-semantic-enrichment"
    return FacetDecision(status=status, reason=reason, decided_at=decided_at)


def effective_annotation(
    facet: Facet,
    descriptor: TargetModelDescriptor,
    entity: TargetEntityDecl,
) -> tuple[SemanticCompleteness, str]:
    """Return the completeness annotation for *facet* and a label for its origin.

    An entity-level annotation, when present, overrides the descriptor's.
    """
    if entity.completeness is None:
        return getattr(descriptor.completeness, facet.value), "descriptor-level annotation"
    return getattr(entity.completeness, facet.value), "entity-level annotation"


def derive_supplied_by_adapter_flags(
    entity: TargetEntityDecl,
    bindings: list[VocabularyBindingDecl],
    terms: list[TargetTermDecl],
) -> dict[Facet, bool]:
    """Report, per facet, whether the adapter already inlined data for it.

    Only ``semantic_aliases`` (any property has synonyms) and ``terms``
    (the entity binds a vocabulary that has INLINE terms) can be True.
    """
    own_name = entity.ref.qualified_name
    synonyms_present = any(prop.synonyms for prop in entity.properties)
    bound_vocabs = {
        binding.vocabulary.name
        for binding in bindings
        if binding.entity_ref.qualified_name == own_name
    }
    inline_vocabs = {
        term.vocabulary.name
        for term in terms
        if term.vocabulary.source is VocabularySource.INLINE
    }
    return {
        Facet.structure: False,
        Facet.obligations: False,
        Facet.vocabulary_bindings: False,
        Facet.semantic_aliases: synonyms_present,
        Facet.terms: bool(bound_vocabs & inline_vocabs),
    }


def derive_enrichment_record(
    entity: TargetEntityDecl,
    descriptor: TargetModelDescriptor,
    bindings: list[VocabularyBindingDecl],
    terms: list[TargetTermDecl],
    skip_facets: frozenset[str],
    iter_terms_external_vocabs: frozenset[str],
) -> EnrichmentDecisionRecord:
    """Build the per-facet decision record for one entity.

    Every facet decision and the record itself share one `decided_at` stamp.
    """
    stamp = _utcnow()
    supplied = derive_supplied_by_adapter_flags(entity, bindings, terms)
    per_facet: dict[Facet, FacetDecision] = {}
    for facet in Facet:
        annotation, origin = effective_annotation(facet, descriptor, entity)
        annotation, origin = _apply_iter_terms_external(
            facet, entity, bindings, iter_terms_external_vocabs, annotation, origin
        )
        per_facet[facet] = derive_facet_decision(
            facet, annotation, origin, skip_facets, supplied[facet], stamp
        )
    return EnrichmentDecisionRecord(
        entity_ref=entity.ref, decisions=per_facet, decided_at=stamp
    )


def _apply_iter_terms_external(
    facet: Facet,
    entity: TargetEntityDecl,
    bindings: list[VocabularyBindingDecl],
    iter_terms_external_vocabs: frozenset[str],
    ann: SemanticCompleteness,
    source: str,
) -> tuple[SemanticCompleteness, str]:
    """Override the ``terms`` annotation to EXTERNAL for un-iterable vocabularies.

    Applies only to the ``terms`` facet, and only when the entity binds at
    least one vocabulary listed in *iter_terms_external_vocabs*. Otherwise
    the incoming annotation and source are returned unchanged.
    """
    if facet is not Facet.terms:
        return ann, source
    own_name = entity.ref.qualified_name
    bound_names = [
        b.vocabulary.name for b in bindings if b.entity_ref.qualified_name == own_name
    ]
    flagged = sorted({name for name in bound_names if name in iter_terms_external_vocabs})
    if not flagged:
        return ann, source
    return (
        SemanticCompleteness.EXTERNAL,
        f"adapter raised NotImplementedError for vocabularies {flagged}; "
        f"treated as EXTERNAL",
    )


def populate_card_hashes(
    cards: Iterable[TargetContextCard],
    persisted_hashes: dict[tuple[str, str, str], str] | None = None,
) -> list[LoadedContextCard]:
    """Attach a content hash to each card and detect drift.

    Raises:
        CardContentDriftError: if a persisted hash exists for the same
            ``(target_model_id, qualified_name, card_version)`` key and
            differs from the freshly computed one.
    """
    known = persisted_hashes or {}
    loaded: list[LoadedContextCard] = []
    for card in cards:
        digest = compute_card_hash(card)
        key = (
            card.entity_ref.target_model_id,
            card.entity_ref.qualified_name,
            card.card_version,
        )
        previous = known.get(key)
        if previous is not None and previous != digest:
            raise CardContentDriftError(
                f"card content drift for target_model_id={key[0]!r} "
                f"entity_ref={key[1]!r} card_version={key[2]!r}: "
                f"previous_hash={previous!r} current_hash={digest!r}; bump card_version"
            )
        loaded.append(LoadedContextCard.from_target_card(card, digest))
    return loaded


def aggregate_context_card_version(cards: list[LoadedContextCard]) -> str:
    """Collapse the card versions into one identifier.

    A single card yields its own `card_version`. Any other count yields the
    SHA-256 of the sorted ``(qualified_name, card_version)`` pairs.
    """
    if len(cards) == 1:
        (only,) = cards
        return only.card_version
    ordered = sorted((c.entity_ref.qualified_name, c.card_version) for c in cards)
    return hashlib.sha256(canonical_dumps(ordered).encode("utf-8")).hexdigest()


def card_versions_dict(cards: list[LoadedContextCard]) -> dict[str, str]:
    """Map each card's entity qualified name to its `card_version`."""
    versions: dict[str, str] = {}
    for card in cards:
        versions[card.entity_ref.qualified_name] = card.card_version
    return versions
# --- src/sema/targets/materializer.py ---


@runtime_checkable
class GraphWriter(Protocol):
    """Structural interface of a graph writer: one method per op kind.

    `runtime_checkable`, so `isinstance` only verifies that the methods
    exist, not their signatures.
    """

    def write_entity(self, op: EntityOp) -> None:
        ...

    def write_property(self, op: PropertyOp) -> None:
        ...

    def write_term(self, op: TermOp) -> None:
        ...

    def write_constraint(self, op: object) -> None:
        ...

    def write_target_obligation(self, op: TargetObligationOp) -> None:
        ...

    def write_enrichment_decision(self, op: EnrichmentDecisionOp) -> None:
        ...

    def write_relationship(self, op: RelationshipOp) -> None:
        ...

    def write_vocabulary_binding(self, op: VocabularyBindingOp) -> None:
        ...

    def write_context_card(self, op: ContextCardOp) -> None:
        ...

    def flip_prior_generations(self, op: CurrentFlipOp) -> None:
        ...
# --- src/sema/targets/materializer.py (continued) ---


class InMemoryGraphWriter:
    """Test double that records every write call as a typed `WriteOp`."""

    def __init__(self) -> None:
        self.ops: list[WriteOp] = []

    def write_entity(self, op: EntityOp) -> None:
        self.ops.append(op)

    def write_property(self, op: PropertyOp) -> None:
        self.ops.append(op)

    def write_term(self, op: TermOp) -> None:
        self.ops.append(op)

    def write_constraint(self, op: object) -> None:
        # Only typed ConstraintOp instances are recorded; anything else is
        # ignored, as in the original implementation.
        if isinstance(op, ConstraintOp):
            self.ops.append(op)

    def write_target_obligation(self, op: TargetObligationOp) -> None:
        self.ops.append(op)

    def write_enrichment_decision(self, op: EnrichmentDecisionOp) -> None:
        self.ops.append(op)

    def write_relationship(self, op: RelationshipOp) -> None:
        self.ops.append(op)

    def write_vocabulary_binding(self, op: VocabularyBindingOp) -> None:
        self.ops.append(op)

    def write_context_card(self, op: ContextCardOp) -> None:
        self.ops.append(op)

    def flip_prior_generations(self, op: CurrentFlipOp) -> None:
        self.ops.append(op)


class StageGuard:
    """Tracks loader stage progression; materializer requires HASHED state."""

    NORMALIZED = "normalized"
    HASHED = "hashed"
    MATERIALIZED = "materialized"

    # Single source of truth for the legal stage order.
    _ORDER: tuple[str, ...] = (NORMALIZED, HASHED, MATERIALIZED)

    def __init__(self) -> None:
        self.state: str | None = None

    def _index(self, state: str | None) -> int:
        """Return the position of *state* in the pipeline (-1 for not started).

        Raises:
            LoaderStageOrderError: for a stage name that is not part of the
                pipeline (instead of a bare ValueError from tuple.index).
        """
        if state is None:
            return -1
        try:
            return self._ORDER.index(state)
        except ValueError:
            raise LoaderStageOrderError(f"unknown loader stage: {state!r}") from None

    def transition_to(self, state: str) -> None:
        """Advance to *state*, which must be exactly the next stage.

        Raises:
            LoaderStageOrderError: if *state* is unknown, skipped ahead,
                repeated, or moves backwards.
        """
        prior_index = self._index(self.state)
        new_index = self._index(state)
        if new_index != prior_index + 1:
            raise LoaderStageOrderError(
                f"stage out of order: tried to enter {state!r} from {self.state!r}"
            )
        self.state = state

    def require_at_least(self, state: str) -> None:
        """Check that the pipeline has reached *state* or a later stage.

        Raises:
            LoaderStageOrderError: if the current stage is earlier than
                *state*, or *state* is unknown.
        """
        if self._index(self.state) < self._index(state):
            raise LoaderStageOrderError(
                f"materializer requires stage >= {state!r}; current={self.state!r}"
            )


class TargetModelMaterializer:
    """Writes a normalized, hashed target model through a `GraphWriter`."""

    @staticmethod
    def write(
        model: NormalizedTargetModel,
        target_schema_snapshot_hash: str,
        writer: GraphWriter,
        enrichment_decisions: list[EnrichmentDecisionRecord],
        cards_with_hash: list[LoadedContextCard] | None = None,
        stage_guard: StageGuard | None = None,
    ) -> list[EnrichmentDecisionRecord]:
        """Materialize *model* and flip prior generations.

        Write order: per entity (entity and properties, constraints, decision
        node), then obligations, terms, vocabulary bindings, context cards,
        and finally the current-generation flip.

        Returns:
            *enrichment_decisions*, unchanged.

        Raises:
            LoaderStageOrderError: if *stage_guard* has not reached HASHED.
            EnrichmentStatusDivergenceError: if any entity lacks a decision
                record. Raised before the first write so a bad input cannot
                leave a partially materialized generation behind.
        """
        if stage_guard is not None:
            stage_guard.require_at_least(StageGuard.HASHED)
        descriptor = model.descriptor
        decisions_by_entity = {
            r.entity_ref.qualified_name: r for r in enrichment_decisions
        }
        obligations_by_entity = {o.target_entity: o for o in model.obligations}
        cards = cards_with_hash or []

        # Validate up front: fail before touching the writer.
        for entity in model.entities:
            if decisions_by_entity.get(entity.ref.qualified_name) is None:
                raise EnrichmentStatusDivergenceError(
                    f"entity {entity.ref.qualified_name!r} has no EnrichmentDecisionRecord"
                )

        for entity in model.entities:
            decision = decisions_by_entity[entity.ref.qualified_name]
            write_entity_and_properties(
                writer, descriptor, entity, target_schema_snapshot_hash, decision
            )
            write_constraints(
                writer,
                descriptor,
                entity,
                obligations_by_entity.get(entity.ref.qualified_name),
                target_schema_snapshot_hash,
            )
            write_decision_node(
                writer, descriptor, entity, target_schema_snapshot_hash, decision
            )
        for obligation in model.obligations:
            write_obligation(writer, descriptor, obligation, target_schema_snapshot_hash)
        for term in model.terms:
            write_term(writer, descriptor, term, target_schema_snapshot_hash)
        write_vocabulary_bindings(
            writer, descriptor, model.vocabulary_bindings, target_schema_snapshot_hash
        )
        write_context_cards(writer, descriptor, cards, target_schema_snapshot_hash)
        writer.flip_prior_generations(
            build_current_flip_op(
                descriptor,
                model,
                target_schema_snapshot_hash,
                bindings=model.vocabulary_bindings,
                cards=cards,
            )
        )
        if stage_guard is not None:
            stage_guard.transition_to(StageGuard.MATERIALIZED)
        return enrichment_decisions


# --- src/sema/targets/materializer_binding_card_utils.py ---


class _WriterLike(Protocol):
    """The subset of writer methods used by the binding and card helpers."""

    def write_vocabulary_binding(self, op: VocabularyBindingOp) -> None:
        ...

    def write_context_card(self, op: ContextCardOp) -> None:
        ...

    def write_relationship(self, op: RelationshipOp) -> None:
        ...
# --- src/sema/targets/materializer_binding_card_utils.py (continued) ---


def write_vocabulary_bindings(
    writer: _WriterLike,
    descriptor: Any,
    bindings: list[VocabularyBindingDecl],
    snapshot_hash: str,
) -> None:
    """Write each binding node, then its HAS_VOCABULARY_BINDING relationship."""
    for binding in bindings:
        node_op = _binding_op(descriptor, binding, snapshot_hash)
        writer.write_vocabulary_binding(node_op)
        edge_op = _binding_rel(descriptor, binding, snapshot_hash)
        writer.write_relationship(edge_op)


def _binding_op(
    descriptor: Any,
    b: VocabularyBindingDecl,
    snapshot_hash: str,
) -> VocabularyBindingOp:
    """Build the node op for one vocabulary binding."""
    vocabulary = b.vocabulary
    return VocabularyBindingOp(
        target_model_id=descriptor.target_model_id,
        target_model_version=descriptor.target_model_version,
        target_schema_snapshot_hash=snapshot_hash,
        parent_entity_qualified_name=b.entity_ref.qualified_name,
        property_name=b.property_name,
        vocabulary_name=vocabulary.name,
        vocabulary_source=vocabulary.source.value,
        domain=b.domain,
        require_standard=b.require_standard,
        allow_zero_default=b.allow_zero_default,
        effective_date_ref=b.effective_date_ref,
        resolver_policy_ref=b.resolver_policy_ref,
    )


def _binding_rel(
    descriptor: Any,
    b: VocabularyBindingDecl,
    snapshot_hash: str,
) -> RelationshipOp:
    """Build the Property -> VocabularyBinding relationship op."""
    entity_name = b.entity_ref.qualified_name
    property_keys = dict(_versioned_keys(descriptor, snapshot_hash))
    property_keys["parent_entity_qualified_name"] = entity_name
    property_keys["name"] = b.property_name
    binding_keys = dict(_versioned_keys(descriptor, snapshot_hash))
    binding_keys["parent_entity_qualified_name"] = entity_name
    binding_keys["property_name"] = b.property_name
    binding_keys["vocabulary_name"] = b.vocabulary.name
    return RelationshipOp(
        rel_type="HAS_VOCABULARY_BINDING",
        target_schema_snapshot_hash=snapshot_hash,
        from_label="Property",
        from_keys=property_keys,
        to_label="VocabularyBinding",
        to_keys=binding_keys,
    )


def write_context_cards(
    writer: _WriterLike,
    descriptor: Any,
    cards: list[LoadedContextCard],
    snapshot_hash: str,
) -> None:
    """Write each context-card node, then its HAS_CONTEXT_CARD relationship."""
    for card in cards:
        node_op = _card_op(descriptor, card, snapshot_hash)
        writer.write_context_card(node_op)
        edge_op = _card_rel(descriptor, card, snapshot_hash)
        writer.write_relationship(edge_op)


def _card_op(
    descriptor: Any, card: LoadedContextCard, snapshot_hash: str
) -> ContextCardOp:
    """Build the node op for one loaded context card."""
    return ContextCardOp(
        target_model_id=descriptor.target_model_id,
        target_model_version=descriptor.target_model_version,
        target_schema_snapshot_hash=snapshot_hash,
        entity_qualified_name=card.entity_ref.qualified_name,
        card_version=card.card_version,
        card_hash=card.card_hash,
        description=card.description,
        examples=list(card.examples),
        obligation_summary=card.obligation_summary,
        curated_synonyms=list(card.curated_synonyms),
    )


def _card_rel(
    descriptor: Any, card: LoadedContextCard, snapshot_hash: str
) -> RelationshipOp:
    """Build the Entity -> ContextCard relationship op."""
    entity_name = card.entity_ref.qualified_name
    entity_keys = dict(_versioned_keys(descriptor, snapshot_hash))
    entity_keys["qualified_name"] = entity_name
    card_keys = dict(_versioned_keys(descriptor, snapshot_hash))
    card_keys["entity_qualified_name"] = entity_name
    card_keys["card_version"] = card.card_version
    return RelationshipOp(
        rel_type="HAS_CONTEXT_CARD",
        target_schema_snapshot_hash=snapshot_hash,
        from_label="Entity",
        from_keys=entity_keys,
        to_label="ContextCard",
        to_keys=card_keys,
    )


def _versioned_keys(descriptor: Any, snapshot_hash: str) -> dict[str, str]:
    """Return the three identity keys shared by both ends of a relationship."""
    return {
        "target_model_id": descriptor.target_model_id,
        "target_model_version": descriptor.target_model_version,
        "target_schema_snapshot_hash": snapshot_hash,
    }
+""" + +from __future__ import annotations + +from datetime import datetime +from typing import Any, Literal + +from pydantic import BaseModel, ConfigDict, Field + + +class _Op(BaseModel): + model_config = ConfigDict(extra="forbid", frozen=True) + + +class EntityOp(_Op): + op: Literal["entity"] = "entity" + target_model_id: str + target_model_version: str + target_schema_snapshot_hash: str + qualified_name: str + kind: str + enrichment_status: dict[str, str] + is_current: bool = True + + +class PropertyOp(_Op): + op: Literal["property"] = "property" + target_model_id: str + target_model_version: str + target_schema_snapshot_hash: str + parent_entity_qualified_name: str + name: str + type: str + nullable: bool + synonyms: list[str] = Field(default_factory=list) + decoded_values: dict[str, str] = Field(default_factory=dict) + property_kind: str = "COLUMN" + endpoint_role: str | None = None + endpoint_target_entity_qualified_name: str | None = None + endpoint_cardinality: str | None = None + endpoint_nullable: bool | None = None + materialized_as_edge_property: bool = True + is_current: bool = True + + +class TermOp(_Op): + op: Literal["term"] = "term" + target_model_id: str + target_model_version: str + target_schema_snapshot_hash: str + vocabulary_name: str + code: str + display: str + + +class ConstraintOp(_Op): + op: Literal["constraint"] = "constraint" + target_model_id: str + target_model_version: str + target_schema_snapshot_hash: str + attached_property_id: str + constraint_kind: str + payload: dict[str, Any] = Field(default_factory=dict) + payload_hash: str + is_current: bool = True + + +class TargetObligationOp(_Op): + op: Literal["target_obligation"] = "target_obligation" + target_model_id: str + target_model_version: str + target_schema_snapshot_hash: str + target_entity: str + payload: dict[str, Any] + + +class EnrichmentDecisionOp(_Op): + op: Literal["enrichment_decision"] = "enrichment_decision" + target_model_id: str + target_model_version: str + 
# One relationship between two already-written nodes. `relationship_merge`
# MATCHes each endpoint by its label and key/value equality, then MERGEs an
# edge of type `rel_type` carrying `target_schema_snapshot_hash`.
class RelationshipOp(_Op):
    op: Literal["relationship"] = "relationship"
    rel_type: str
    target_schema_snapshot_hash: str
    from_label: str
    from_keys: dict[str, str]
    to_label: str
    to_keys: dict[str, str]


# One vocabulary binding for a property. `vocabulary_binding_merge` uses the
# versioned keys plus (parent_entity_qualified_name, property_name,
# vocabulary_name) as the MERGE identity; the remaining fields are SET.
class VocabularyBindingOp(_Op):
    op: Literal["vocabulary_binding"] = "vocabulary_binding"
    target_model_id: str
    target_model_version: str
    target_schema_snapshot_hash: str
    parent_entity_qualified_name: str
    property_name: str
    vocabulary_name: str
    vocabulary_source: str
    domain: str | None = None
    require_standard: bool = False
    allow_zero_default: bool = False
    effective_date_ref: str | None = None
    resolver_policy_ref: str | None = None
    is_current: bool = True


# One context card for an entity. `context_card_merge` includes
# `card_version` in the MERGE identity, so a new card version creates a new
# node rather than overwriting the previous one.
class ContextCardOp(_Op):
    op: Literal["context_card"] = "context_card"
    target_model_id: str
    target_model_version: str
    target_schema_snapshot_hash: str
    entity_qualified_name: str
    card_version: str
    card_hash: str
    description: str
    examples: list[str] = Field(default_factory=list)
    obligation_summary: str | None = None
    curated_synonyms: list[str] = Field(default_factory=list)
    is_current: bool = True
= Field( + default_factory=tuple + ) + context_card_keys: tuple[tuple[str, str], ...] = Field(default_factory=tuple) + term_keys: tuple[tuple[str, str], ...] = Field(default_factory=tuple) + + +WriteOp = ( + EntityOp + | PropertyOp + | TermOp + | ConstraintOp + | TargetObligationOp + | EnrichmentDecisionOp + | VocabularyBindingOp + | ContextCardOp + | RelationshipOp + | CurrentFlipOp +) diff --git a/src/sema/targets/materializer_utils.py b/src/sema/targets/materializer_utils.py new file mode 100644 index 0000000..bfe1504 --- /dev/null +++ b/src/sema/targets/materializer_utils.py @@ -0,0 +1,340 @@ +"""Helpers for `TargetModelMaterializer`. + +Keeps `materializer.py` thin (it must stay under the 400-line cap and +remains the only `sema.targets` module permitted to import `sema.graph`, +so non-graph helpers live here). +""" + +from __future__ import annotations + +import hashlib +from typing import Any, Protocol + +from sema.models.planner.target_model import DomainConstraint, ForeignKeyObligation +from sema.models.target.context_card import LoadedContextCard +from sema.models.target.enrichment import EnrichmentDecisionRecord +from sema.models.target.entity import TargetEntityDecl +from sema.models.target.normalized import NormalizedTargetModel +from sema.models.target.obligation import TargetObligationDecl +from sema.models.target.properties import PropertyKind, TargetPropertyDecl +from sema.models.target.term import TargetTermDecl +from sema.models.target.vocab_binding import VocabularyBindingDecl +from sema.targets.hashing_utils import canonical_dumps +from sema.targets.materializer_ops import ( + ConstraintOp, + CurrentFlipOp, + EnrichmentDecisionOp, + EntityOp, + PropertyOp, + RelationshipOp, + TargetObligationOp, + TermOp, +) + + +class _WriterLike(Protocol): + def write_entity(self, op: EntityOp) -> None: ... + def write_property(self, op: PropertyOp) -> None: ... + def write_term(self, op: TermOp) -> None: ... 
def constraints_for_obligation(
    descriptor: Any,
    entity: TargetEntityDecl,
    obligation: TargetObligationDecl,
    snapshot_hash: str,
) -> list[ConstraintOp]:
    """Return one `ConstraintOp` per domain constraint on a COLUMN property.

    Domain constraints whose `property_name` is not a COLUMN-kind property
    of `entity` are skipped.
    """
    column_names = set()
    for prop in entity.properties:
        if prop.property_kind is PropertyKind.COLUMN:
            column_names.add(prop.name)

    ops: list[ConstraintOp] = []
    for constraint in obligation.domain_constraints:
        if constraint.property_name not in column_names:
            continue
        ops.append(
            _domain_constraint_op(descriptor, entity, constraint, snapshot_hash)
        )
    return ops


def _domain_constraint_op(
    descriptor: Any,
    entity: TargetEntityDecl,
    dc: DomainConstraint,
    snapshot_hash: str,
) -> ConstraintOp:
    """Build the `domain_binding` constraint op for one domain constraint."""
    body = {"domain_id": dc.domain_id}
    digest = _payload_hash(body)
    property_id = f"{entity.ref.qualified_name}.{dc.property_name}"
    return ConstraintOp(
        target_model_id=descriptor.target_model_id,
        target_model_version=descriptor.target_model_version,
        target_schema_snapshot_hash=snapshot_hash,
        attached_property_id=property_id,
        constraint_kind="domain_binding",
        payload=body,
        payload_hash=digest,
    )


def _payload_hash(payload: dict[str, Any]) -> str:
    """Return the SHA-256 hex digest of the canonical dump of `payload`."""
    encoded = canonical_dumps(payload).encode("utf-8")
    return hashlib.sha256(encoded).hexdigest()
def _property_op(
    descriptor: Any,
    entity: TargetEntityDecl,
    prop: TargetPropertyDecl,
    snapshot_hash: str,
) -> PropertyOp:
    """Translate one property declaration into a `PropertyOp`.

    ENDPOINT-kind properties always get
    `materialized_as_edge_property=False`; every other kind keeps the
    value declared on `prop`.
    """
    is_endpoint = prop.property_kind is PropertyKind.ENDPOINT
    return PropertyOp(
        target_model_id=descriptor.target_model_id,
        target_model_version=descriptor.target_model_version,
        target_schema_snapshot_hash=snapshot_hash,
        parent_entity_qualified_name=entity.ref.qualified_name,
        name=prop.name,
        type=prop.type,
        nullable=prop.nullable,
        # Copy the collections so the op does not alias the declaration.
        synonyms=list(prop.synonyms),
        decoded_values=dict(prop.decoded_values),
        property_kind=prop.property_kind.value,
        endpoint_role=prop.endpoint_role,
        endpoint_target_entity_qualified_name=prop.endpoint_target_entity_qualified_name,
        endpoint_cardinality=prop.endpoint_cardinality,
        endpoint_nullable=prop.endpoint_nullable,
        materialized_as_edge_property=(
            False if is_endpoint else prop.materialized_as_edge_property
        ),
    )


def _has_property_rel(
    descriptor: Any,
    entity: TargetEntityDecl,
    prop: TargetPropertyDecl,
    snapshot_hash: str,
) -> RelationshipOp:
    """Build the `HAS_PROPERTY` edge from an Entity to one of its Properties.

    Uses the module's `_entity_keys` / `_versioned_keys` helpers, the same
    ones `write_decision_node` and `write_obligation` use, so every edge
    addresses its endpoints with an identically shaped key dict.
    """
    qualified_name = entity.ref.qualified_name
    return RelationshipOp(
        rel_type="HAS_PROPERTY",
        target_schema_snapshot_hash=snapshot_hash,
        from_label="Entity",
        from_keys=_entity_keys(descriptor, qualified_name, snapshot_hash),
        to_label="Property",
        to_keys={
            **_versioned_keys(descriptor, snapshot_hash),
            "parent_entity_qualified_name": qualified_name,
            "name": prop.name,
        },
    )
def write_obligation(
    writer: _WriterLike,
    descriptor: Any,
    obligation: TargetObligationDecl,
    snapshot_hash: str,
) -> None:
    """Write one TargetObligation node and its `HAS_OBLIGATION` edge.

    The node payload is the obligation's JSON-mode dump with
    `foreign_keys` replaced by the per-key JSON dumps.
    """
    target = obligation.target_entity

    body = obligation.model_dump(mode="json")
    body["foreign_keys"] = [_fk_payload(fk) for fk in obligation.foreign_keys]

    node_op = TargetObligationOp(
        target_model_id=descriptor.target_model_id,
        target_model_version=descriptor.target_model_version,
        target_schema_snapshot_hash=snapshot_hash,
        target_entity=target,
        payload=body,
    )
    writer.write_target_obligation(node_op)

    # The edge is built only after the node write, as in the original flow.
    obligation_keys = dict(_versioned_keys(descriptor, snapshot_hash))
    obligation_keys["target_entity"] = obligation.target_entity
    edge_op = RelationshipOp(
        rel_type="HAS_OBLIGATION",
        target_schema_snapshot_hash=snapshot_hash,
        from_label="Entity",
        from_keys=_entity_keys(descriptor, obligation.target_entity, snapshot_hash),
        to_label="TargetObligation",
        to_keys=obligation_keys,
    )
    writer.write_relationship(edge_op)


def _fk_payload(fk: ForeignKeyObligation) -> dict[str, object]:
    """Return the JSON-mode dump of one foreign-key obligation."""
    dumped = fk.model_dump(mode="json")
    return dumped
def write_constraints(
    writer: _WriterLike,
    descriptor: Any,
    entity: TargetEntityDecl,
    obligation: TargetObligationDecl | None,
    snapshot_hash: str,
) -> None:
    """Write every constraint derived from `obligation`; no-op when it is None."""
    if obligation is not None:
        pending = constraints_for_obligation(
            descriptor, entity, obligation, snapshot_hash
        )
        for constraint_op in pending:
            writer.write_constraint(constraint_op)


def build_current_flip_op(
    descriptor: Any,
    model: NormalizedTargetModel,
    snapshot_hash: str,
    bindings: list[VocabularyBindingDecl] | None = None,
    cards: list[LoadedContextCard] | None = None,
) -> CurrentFlipOp:
    """Collect the identity tuples touched by this load into a `CurrentFlipOp`.

    Entity names double as the enrichment-decision refs. `bindings` and
    `cards` default to empty when omitted.
    """
    entity_names = tuple(entity.ref.qualified_name for entity in model.entities)

    property_keys = []
    for entity in model.entities:
        for prop in entity.properties:
            property_keys.append((entity.ref.qualified_name, prop.name))

    obligation_targets = tuple(
        obligation.target_entity for obligation in model.obligations
    )
    term_keys = tuple((term.vocabulary.name, term.code) for term in model.terms)

    binding_keys = []
    for binding in bindings or []:
        binding_keys.append(
            (
                binding.entity_ref.qualified_name,
                binding.property_name,
                binding.vocabulary.name,
            )
        )

    card_keys = []
    for card in cards or []:
        card_keys.append((card.entity_ref.qualified_name, card.card_version))

    return CurrentFlipOp(
        target_model_id=descriptor.target_model_id,
        target_model_version=descriptor.target_model_version,
        current_snapshot_hash=snapshot_hash,
        entity_qualified_names=entity_names,
        property_keys=tuple(property_keys),
        obligation_target_entities=obligation_targets,
        enrichment_entity_refs=entity_names,
        vocabulary_binding_keys=tuple(binding_keys),
        context_card_keys=tuple(card_keys),
        term_keys=term_keys,
    )
class Neo4jGraphWriter:
    """`GraphWriter` that issues hash-versioned MERGEs against Neo4j.

    Built from a driver, every write opens its own session and runs one
    Cypher statement. Built with `from_session`, every write is run on
    the single caller-owned object instead.

    NOTE(review): with the official driver, `Session.run` executes each
    statement as its own auto-commit transaction, so sharing a session
    alone does not make a sequence of writes atomic. `from_session` only
    needs an object exposing `run(cypher, **params)`; callers that need
    all-or-nothing semantics should pass an explicit transaction object
    and commit it themselves. Confirm against callers.
    """

    def __init__(self, driver: Any) -> None:
        self._driver = driver
        # Always defined so `_run` never depends on how the instance was
        # built; only `from_session` sets it to a real object.
        self._session: Any = None

    @classmethod
    def from_session(cls, session: Any) -> "Neo4jGraphWriter":
        """Return a writer that runs every statement on `session`."""
        instance = cls.__new__(cls)
        instance._driver = None
        instance._session = session
        return instance

    def _run(self, cypher: str, params: dict[str, Any]) -> None:
        """Run one statement on the shared session or on a fresh one."""
        if self._driver is None:
            self._session.run(cypher, **params)
            return
        with self._driver.session() as session:
            session.run(cypher, **params)

    def write_entity(self, op: EntityOp) -> None:
        self._run(*entity_merge(op))

    def write_property(self, op: PropertyOp) -> None:
        self._run(*property_merge(op))

    def write_term(self, op: TermOp) -> None:
        self._run(*term_merge(op))

    def write_constraint(self, op: object) -> None:
        # The writer protocol types this parameter as `object`; anything
        # that is not a ConstraintOp is deliberately ignored.
        if not isinstance(op, ConstraintOp):
            return
        self._run(*constraint_merge(op))

    def write_target_obligation(self, op: TargetObligationOp) -> None:
        self._run(*target_obligation_merge(op))

    def write_enrichment_decision(self, op: EnrichmentDecisionOp) -> None:
        self._run(*enrichment_decision_merge(op))

    def write_relationship(self, op: RelationshipOp) -> None:
        self._run(*relationship_merge(op))

    def write_vocabulary_binding(self, op: VocabularyBindingOp) -> None:
        self._run(*vocabulary_binding_merge(op))

    def write_context_card(self, op: ContextCardOp) -> None:
        self._run(*context_card_merge(op))

    def flip_prior_generations(self, op: CurrentFlipOp) -> None:
        """Run every flip statement produced for `op`, in order."""
        for cypher, params in flip_statements(op):
            self._run(cypher, params)
def flip_statements(op: CurrentFlipOp) -> list[tuple[str, dict[str, Any]]]:
    """Return every `(cypher, params)` flip statement for `op`, in order.

    Each per-label builder receives the same base parameters (model id,
    model version, current snapshot hash) and contributes zero or one
    statement.
    """
    shared_params = {
        "target_model_id": op.target_model_id,
        "target_model_version": op.target_model_version,
        "current_hash": op.current_snapshot_hash,
    }
    builders = (
        _entity_flip,
        _property_flip,
        _obligation_flip,
        _enrichment_flip,
        _term_flip,
        _constraint_flip,
        _vocab_binding_flip,
        _context_card_flip,
    )
    statements: list[tuple[str, dict[str, Any]]] = []
    for build in builders:
        statements.extend(build(op, shared_params))
    return statements


def _entity_flip(
    op: CurrentFlipOp, base: dict[str, Any]
) -> list[tuple[str, dict[str, Any]]]:
    """Flip prior-generation Entity nodes named in this load."""
    if op.entity_qualified_names:
        cypher = (
            "MATCH (n:Entity) WHERE n.target_model_id = $target_model_id "
            "AND n.target_model_version = $target_model_version "
            "AND n.qualified_name IN $names "
            "AND n.target_schema_snapshot_hash <> $current_hash "
            "SET n.is_current = false"
        )
        params = dict(base)
        params["names"] = list(op.entity_qualified_names)
        return [(cypher, params)]
    return []
def _obligation_flip(
    op: CurrentFlipOp, base: dict[str, Any]
) -> list[tuple[str, dict[str, Any]]]:
    """Flip prior-generation TargetObligation nodes for the loaded entities."""
    if op.obligation_target_entities:
        cypher = (
            "MATCH (n:TargetObligation) WHERE n.target_model_id = $target_model_id "
            "AND n.target_model_version = $target_model_version "
            "AND n.target_entity IN $entities "
            "AND n.target_schema_snapshot_hash <> $current_hash "
            "SET n.is_current = false"
        )
        params = dict(base)
        params["entities"] = list(op.obligation_target_entities)
        return [(cypher, params)]
    return []


def _enrichment_flip(
    op: CurrentFlipOp, base: dict[str, Any]
) -> list[tuple[str, dict[str, Any]]]:
    """Flip prior-generation EnrichmentDecision nodes for the loaded entities."""
    if op.enrichment_entity_refs:
        cypher = (
            "MATCH (n:EnrichmentDecision) WHERE n.target_model_id = $target_model_id "
            "AND n.target_model_version = $target_model_version "
            "AND n.entity_ref IN $names "
            "AND n.target_schema_snapshot_hash <> $current_hash "
            "SET n.is_current = false"
        )
        params = dict(base)
        params["names"] = list(op.enrichment_entity_refs)
        return [(cypher, params)]
    return []


def _term_flip(
    op: CurrentFlipOp, base: dict[str, Any]
) -> list[tuple[str, dict[str, Any]]]:
    """Flip prior-generation Term nodes matching the loaded (vocabulary, code) pairs."""
    if op.term_keys:
        cypher = (
            "UNWIND $keys AS key "
            "MATCH (n:Term) WHERE n.target_model_id = $target_model_id "
            "AND n.target_model_version = $target_model_version "
            "AND n.vocabulary_name = key[0] AND n.code = key[1] "
            "AND n.target_schema_snapshot_hash <> $current_hash "
            "SET n.is_current = false"
        )
        params = dict(base)
        # Tuples become lists so the driver receives plain list parameters.
        params["keys"] = [list(pair) for pair in op.term_keys]
        return [(cypher, params)]
    return []
def _vocab_binding_flip(
    op: CurrentFlipOp, base: dict[str, Any]
) -> list[tuple[str, dict[str, Any]]]:
    """Flip prior-generation VocabularyBinding nodes for the loaded bindings."""
    if op.vocabulary_binding_keys:
        cypher = (
            "UNWIND $keys AS key "
            "MATCH (n:VocabularyBinding) WHERE n.target_model_id = $target_model_id "
            "AND n.target_model_version = $target_model_version "
            "AND n.parent_entity_qualified_name = key[0] "
            "AND n.property_name = key[1] AND n.vocabulary_name = key[2] "
            "AND n.target_schema_snapshot_hash <> $current_hash "
            "SET n.is_current = false"
        )
        params = dict(base)
        params["keys"] = [list(triple) for triple in op.vocabulary_binding_keys]
        return [(cypher, params)]
    return []


def _context_card_flip(
    op: CurrentFlipOp, base: dict[str, Any]
) -> list[tuple[str, dict[str, Any]]]:
    """Flip every card of a loaded entity except the current one.

    A card stays current only when both its snapshot hash and its
    `card_version` match the loaded key. A card-only version bump keeps
    the schema hash, so comparing the snapshot hash alone would not
    retire the older card.
    """
    if op.context_card_keys:
        cypher = (
            "UNWIND $keys AS key "
            "MATCH (n:ContextCard) WHERE n.target_model_id = $target_model_id "
            "AND n.target_model_version = $target_model_version "
            "AND n.entity_qualified_name = key[0] "
            "AND NOT (n.target_schema_snapshot_hash = $current_hash "
            "AND n.card_version = key[1]) "
            "SET n.is_current = false"
        )
        params = dict(base)
        params["keys"] = [list(pair) for pair in op.context_card_keys]
        return [(cypher, params)]
    return []
# Facets whose per-entity enrichment status is stored on the Entity node.
# The SET clause and the parameter dict are both derived from this one
# tuple so a placeholder can never be emitted without its parameter.
_ENRICHMENT_FACETS: tuple[str, ...] = (
    "structure",
    "obligations",
    "vocabulary_bindings",
    "semantic_aliases",
    "terms",
)


def entity_merge(op: EntityOp) -> tuple[str, dict[str, Any]]:
    """Return the `(cypher, params)` pair that upserts one Entity node.

    The node is MERGEd on the versioned identity plus `qualified_name`;
    kind, currency flag, model role, id and the per-facet enrichment
    statuses are SET on every call.
    """
    cypher = (
        "MERGE (n:Entity {"
        "target_model_id: $target_model_id, "
        "target_model_version: $target_model_version, "
        "target_schema_snapshot_hash: $target_schema_snapshot_hash, "
        "qualified_name: $qualified_name"
        "}) "
        "SET n.kind = $kind, n.is_current = $is_current, "
        "n.model_role = $model_role, "
        "n.id = $id, "
        f"{_enrichment_status_assign()}"
    )
    params = {
        "target_model_id": op.target_model_id,
        "target_model_version": op.target_model_version,
        "target_schema_snapshot_hash": op.target_schema_snapshot_hash,
        "qualified_name": op.qualified_name,
        "kind": op.kind,
        "is_current": op.is_current,
        "model_role": _TARGET_ROLE,
        "id": _entity_id(op),
        **_enrichment_status_params(op.enrichment_status),
    }
    return cypher, params


def _enrichment_status_assign() -> str:
    """Return the comma-separated SET assignments for every facet status."""
    return ", ".join(
        f"n.enrichment_{facet}_status = $enrichment_{facet}_status"
        for facet in _ENRICHMENT_FACETS
    )


def _enrichment_status_params(status: dict[str, str]) -> dict[str, Any]:
    """Return one parameter per facet; facets missing from `status` map to None."""
    return {
        f"enrichment_{facet}_status": status.get(facet)
        for facet in _ENRICHMENT_FACETS
    }


def _entity_id(op: EntityOp) -> str:
    """Return the pipe-joined identity string stored as the Entity `id`."""
    return (
        f"{op.target_model_id}|{op.target_model_version}|"
        f"{op.target_schema_snapshot_hash}|{op.qualified_name}"
    )


def property_merge(op: PropertyOp) -> tuple[str, dict[str, Any]]:
    """Return the `(cypher, params)` pair that upserts one Property node.

    The node is MERGEd on the versioned identity plus parent entity and
    property name. `decoded_values` is stored as a JSON string in
    `decoded_values_json`.
    """
    cypher = (
        "MERGE (n:Property {"
        "target_model_id: $target_model_id, "
        "target_model_version: $target_model_version, "
        "target_schema_snapshot_hash: $target_schema_snapshot_hash, "
        "parent_entity_qualified_name: $parent_entity_qualified_name, "
        "name: $name"
        "}) "
        "SET n.type = $type, n.nullable = $nullable, "
        "n.synonyms = $synonyms, n.decoded_values_json = $decoded_values_json, "
        "n.property_kind = $property_kind, "
        "n.endpoint_role = $endpoint_role, "
        "n.endpoint_target_entity_qualified_name = "
        "$endpoint_target_entity_qualified_name, "
        "n.endpoint_cardinality = $endpoint_cardinality, "
        "n.endpoint_nullable = $endpoint_nullable, "
        "n.materialized_as_edge_property = $materialized_as_edge_property, "
        "n.is_current = $is_current, "
        "n.model_role = $model_role, n.id = $id"
    )
    params = {
        "target_model_id": op.target_model_id,
        "target_model_version": op.target_model_version,
        "target_schema_snapshot_hash": op.target_schema_snapshot_hash,
        "parent_entity_qualified_name": op.parent_entity_qualified_name,
        "name": op.name,
        "type": op.type,
        "nullable": op.nullable,
        "synonyms": list(op.synonyms),
        "decoded_values_json": _json_dumps(op.decoded_values),
        "property_kind": op.property_kind,
        "endpoint_role": op.endpoint_role,
        "endpoint_target_entity_qualified_name": op.endpoint_target_entity_qualified_name,
        "endpoint_cardinality": op.endpoint_cardinality,
        "endpoint_nullable": op.endpoint_nullable,
        "materialized_as_edge_property": op.materialized_as_edge_property,
        "is_current": op.is_current,
        "model_role": _TARGET_ROLE,
        "id": _property_id(op),
    }
    return cypher, params


def _property_id(op: PropertyOp) -> str:
    """Return the identity string stored as the Property `id`."""
    return (
        f"{op.target_model_id}|{op.target_model_version}|"
        f"{op.target_schema_snapshot_hash}|"
        f"{op.parent_entity_qualified_name}.{op.name}"
    )
def _json_dumps(payload: dict[str, Any]) -> str:
    """Serialise `payload` to JSON with keys sorted for a stable string."""
    import json as _json

    encoded = _json.dumps(payload, sort_keys=True)
    return encoded


def term_merge(op: TermOp) -> tuple[str, dict[str, Any]]:
    """Return the `(cypher, params)` pair that upserts one Term node.

    `TermOp` has no currency flag, so `is_current` is always written as
    True.
    """
    node_id = (
        f"{op.target_model_id}|{op.target_model_version}|"
        f"{op.target_schema_snapshot_hash}|{op.vocabulary_name}|{op.code}"
    )
    params: dict[str, Any] = {
        "target_model_id": op.target_model_id,
        "target_model_version": op.target_model_version,
        "target_schema_snapshot_hash": op.target_schema_snapshot_hash,
        "vocabulary_name": op.vocabulary_name,
        "code": op.code,
        "display": op.display,
        "is_current": True,
        "model_role": _TARGET_ROLE,
        "id": node_id,
    }
    statement = (
        "MERGE (n:Term {"
        "target_model_id: $target_model_id, "
        "target_model_version: $target_model_version, "
        "target_schema_snapshot_hash: $target_schema_snapshot_hash, "
        "vocabulary_name: $vocabulary_name, "
        "code: $code"
        "}) "
        "SET n.display = $display, n.is_current = $is_current, "
        "n.model_role = $model_role, n.id = $id"
    )
    return statement, params


def constraint_merge(op: ConstraintOp) -> tuple[str, dict[str, Any]]:
    """Return the `(cypher, params)` pair that upserts one Constraint node.

    `payload_hash` is part of the MERGE identity, so a changed payload
    yields a distinct node; the payload itself is stored as sorted JSON.
    """
    node_id = (
        f"{op.target_model_id}|{op.target_model_version}|"
        f"{op.target_schema_snapshot_hash}|{op.attached_property_id}|"
        f"{op.constraint_kind}|{op.payload_hash}"
    )
    params: dict[str, Any] = {
        "target_model_id": op.target_model_id,
        "target_model_version": op.target_model_version,
        "target_schema_snapshot_hash": op.target_schema_snapshot_hash,
        "attached_property_id": op.attached_property_id,
        "constraint_kind": op.constraint_kind,
        "payload_hash": op.payload_hash,
        "payload_json": _json_dumps(op.payload),
        "is_current": op.is_current,
        "model_role": _TARGET_ROLE,
        "id": node_id,
    }
    statement = (
        "MERGE (n:Constraint {"
        "target_model_id: $target_model_id, "
        "target_model_version: $target_model_version, "
        "target_schema_snapshot_hash: $target_schema_snapshot_hash, "
        "attached_property_id: $attached_property_id, "
        "constraint_kind: $constraint_kind, "
        "payload_hash: $payload_hash"
        "}) "
        "SET n.payload_json = $payload_json, n.is_current = $is_current, "
        "n.model_role = $model_role, n.id = $id"
    )
    return statement, params


def target_obligation_merge(op: TargetObligationOp) -> tuple[str, dict[str, Any]]:
    """Return the `(cypher, params)` pair that upserts one TargetObligation node.

    The payload is stored as sorted JSON; values JSON cannot encode are
    stringified via `default=str`. `is_current` is always written as True.
    """
    import json as _json

    payload_json = _json.dumps(op.payload, sort_keys=True, default=str)
    node_id = (
        f"{op.target_model_id}|{op.target_model_version}|"
        f"{op.target_schema_snapshot_hash}|{op.target_entity}"
    )
    params: dict[str, Any] = {
        "target_model_id": op.target_model_id,
        "target_model_version": op.target_model_version,
        "target_schema_snapshot_hash": op.target_schema_snapshot_hash,
        "target_entity": op.target_entity,
        "payload_json": payload_json,
        "is_current": True,
        "model_role": _TARGET_ROLE,
        "id": node_id,
    }
    statement = (
        "MERGE (n:TargetObligation {"
        "target_model_id: $target_model_id, "
        "target_model_version: $target_model_version, "
        "target_schema_snapshot_hash: $target_schema_snapshot_hash, "
        "target_entity: $target_entity"
        "}) "
        "SET n.payload_json = $payload_json, n.is_current = $is_current, "
        "n.model_role = $model_role, n.id = $id"
    )
    return statement, params
def vocabulary_binding_merge(op: VocabularyBindingOp) -> tuple[str, dict[str, Any]]:
    """Return the `(cypher, params)` pair that upserts one VocabularyBinding node.

    The node is MERGEd on the versioned identity plus parent entity,
    property name and vocabulary name; every other field is SET.
    """
    cypher = (
        "MERGE (n:VocabularyBinding {"
        "target_model_id: $target_model_id, "
        "target_model_version: $target_model_version, "
        "target_schema_snapshot_hash: $target_schema_snapshot_hash, "
        "parent_entity_qualified_name: $parent_entity_qualified_name, "
        "property_name: $property_name, "
        "vocabulary_name: $vocabulary_name"
        "}) "
        "SET n.vocabulary_source = $vocabulary_source, n.domain = $domain, "
        "n.require_standard = $require_standard, "
        "n.allow_zero_default = $allow_zero_default, "
        "n.effective_date_ref = $effective_date_ref, "
        "n.resolver_policy_ref = $resolver_policy_ref, "
        "n.is_current = $is_current, n.model_role = $model_role, n.id = $id"
    )
    return cypher, {
        "target_model_id": op.target_model_id,
        "target_model_version": op.target_model_version,
        "target_schema_snapshot_hash": op.target_schema_snapshot_hash,
        "parent_entity_qualified_name": op.parent_entity_qualified_name,
        "property_name": op.property_name,
        "vocabulary_name": op.vocabulary_name,
        "vocabulary_source": op.vocabulary_source,
        "domain": op.domain,
        "require_standard": op.require_standard,
        "allow_zero_default": op.allow_zero_default,
        "effective_date_ref": op.effective_date_ref,
        "resolver_policy_ref": op.resolver_policy_ref,
        "is_current": op.is_current,
        "model_role": _TARGET_ROLE,
        "id": (
            f"{op.target_model_id}|{op.target_model_version}|"
            f"{op.target_schema_snapshot_hash}|"
            f"{op.parent_entity_qualified_name}.{op.property_name}|"
            f"{op.vocabulary_name}"
        ),
    }


def context_card_merge(op: ContextCardOp) -> tuple[str, dict[str, Any]]:
    """Return the `(cypher, params)` pair that upserts one ContextCard node.

    `card_version` is part of the MERGE identity, so each card version is
    its own node.
    """
    cypher = (
        "MERGE (n:ContextCard {"
        "target_model_id: $target_model_id, "
        "target_model_version: $target_model_version, "
        "target_schema_snapshot_hash: $target_schema_snapshot_hash, "
        "entity_qualified_name: $entity_qualified_name, "
        "card_version: $card_version"
        "}) "
        "SET n.card_hash = $card_hash, "
        "n.description = $description, n.examples = $examples, "
        "n.obligation_summary = $obligation_summary, "
        "n.curated_synonyms = $curated_synonyms, "
        "n.is_current = $is_current, n.model_role = $model_role, n.id = $id"
    )
    return cypher, {
        "target_model_id": op.target_model_id,
        "target_model_version": op.target_model_version,
        "target_schema_snapshot_hash": op.target_schema_snapshot_hash,
        "entity_qualified_name": op.entity_qualified_name,
        "card_version": op.card_version,
        "card_hash": op.card_hash,
        "description": op.description,
        "examples": list(op.examples),
        "obligation_summary": op.obligation_summary,
        "curated_synonyms": list(op.curated_synonyms),
        "is_current": op.is_current,
        "model_role": _TARGET_ROLE,
        "id": (
            f"{op.target_model_id}|{op.target_model_version}|"
            f"{op.target_schema_snapshot_hash}|{op.entity_qualified_name}|"
            f"{op.card_version}"
        ),
    }


def relationship_merge(op: RelationshipOp) -> tuple[str, dict[str, Any]]:
    """Return the `(cypher, params)` pair that MERGEs one relationship.

    Both endpoints are MATCHed by label and key equality, with key values
    passed as `from_*` / `to_*` parameters. Labels, the relationship type
    and key names are interpolated into the query text, so they must come
    from trusted code, never from loaded data.

    Raises:
        ValueError: if `from_keys` or `to_keys` is empty, since that would
            leave a bare `WHERE` in the generated Cypher.
    """
    from_match = _key_predicate("a", "from", op.from_keys)
    to_match = _key_predicate("b", "to", op.to_keys)
    cypher = (
        f"MATCH (a:{op.from_label}) WHERE {from_match} "
        f"MATCH (b:{op.to_label}) WHERE {to_match} "
        f"MERGE (a)-[:{op.rel_type} "
        "{target_schema_snapshot_hash: $rel_snapshot_hash}]->(b)"
    )
    params: dict[str, Any] = {"rel_snapshot_hash": op.target_schema_snapshot_hash}
    for k, v in op.from_keys.items():
        params[f"from_{k}"] = v
    for k, v in op.to_keys.items():
        params[f"to_{k}"] = v
    return cypher, params


def _key_predicate(var: str, prefix: str, keys: dict[str, str]) -> str:
    """Return `var.k = $prefix_k` clauses for every key, joined by AND.

    Raises:
        ValueError: if `keys` is empty; an empty predicate would produce
            malformed Cypher instead of selecting a specific node.
    """
    if not keys:
        raise ValueError(
            f"cannot build a match predicate for '{prefix}' endpoint: no keys given"
        )
    parts = [f"{var}.{k} = ${prefix}_{k}" for k in keys]
    return " AND ".join(parts)
"relationship_merge", + "target_obligation_merge", + "term_merge", + "vocabulary_binding_merge", +] diff --git a/src/sema/targets/normalizer.py b/src/sema/targets/normalizer.py new file mode 100644 index 0000000..92268f3 --- /dev/null +++ b/src/sema/targets/normalizer.py @@ -0,0 +1,123 @@ +"""TargetModelNormalizer — adapter output → NormalizedTargetModel.""" + +from __future__ import annotations + +from collections.abc import Iterable + +from sema.models.target.entity import TargetEntityDecl +from sema.models.target.normalized import NormalizedTargetModel +from sema.models.target.obligation import TargetObligationDecl +from sema.models.target.refs import TargetEntityRef, TargetPropertyRef, VocabularyRef +from sema.models.target.term import TargetTermDecl +from sema.models.target.vocab_binding import VocabularyBindingDecl +from sema.targets.base import TargetOntologyAdapter +from sema.targets.normalizer_utils import ( + normalize_entity, + sort_bindings, + sort_context_cards, + sort_entities, + sort_obligations, + sort_terms, + validate_endpoint_targets, + validate_foreign_keys, + validate_obligation_required_fields, + validate_vocabulary_bindings, +) + + +class TargetModelNormalizer: + """Validates DTOs, resolves cross-refs, sorts collections.""" + + @staticmethod + def normalize( + adapter: TargetOntologyAdapter, + selected_refs: Iterable[TargetEntityRef] | None = None, + ) -> NormalizedTargetModel: + descriptor = adapter.describe() + refs = _select_refs(adapter, selected_refs) + raw_entities = [adapter.load_entity(ref) for ref in refs] + entities = [normalize_entity(e) for e in raw_entities] + obligations = [adapter.load_obligation(ref) for ref in refs] + bindings = _collect_bindings(adapter, entities) + terms = _collect_terms(adapter, bindings) + cards = [adapter.load_context_card(ref) for ref in refs] + _resolve_cross_refs(entities, obligations, bindings, terms) + return NormalizedTargetModel( + descriptor=descriptor, + entities=sort_entities(entities), + 
obligations=sort_obligations(obligations), + vocabularies=_collect_vocabularies(bindings, terms), + vocabulary_bindings=sort_bindings(bindings), + terms=sort_terms(terms), + context_cards=sort_context_cards(cards), + ) + + +def _select_refs( + adapter: TargetOntologyAdapter, + selected_refs: Iterable[TargetEntityRef] | None, +) -> list[TargetEntityRef]: + if selected_refs is None: + return list(adapter.discover_entities()) + return list(selected_refs) + + +def _collect_bindings( + adapter: TargetOntologyAdapter, entities: list[TargetEntityDecl] +) -> list[VocabularyBindingDecl]: + bindings: list[VocabularyBindingDecl] = [] + for entity in entities: + for prop in entity.properties: + ref = TargetPropertyRef(entity_ref=entity.ref, property_name=prop.name) + bindings.extend(adapter.load_vocabulary_bindings(ref)) + return bindings + + +def _collect_terms( + adapter: TargetOntologyAdapter, bindings: list[VocabularyBindingDecl] +) -> list[TargetTermDecl]: + terms: list[TargetTermDecl] = [] + seen: set[str] = set() + iter_terms = getattr(adapter, "iter_terms", None) + if iter_terms is None: + return terms + for binding in bindings: + vocab = binding.vocabulary + if vocab.name in seen: + continue + seen.add(vocab.name) + try: + terms.extend(iter_terms(vocab)) + except NotImplementedError: + continue + return terms + + +def _collect_vocabularies( + bindings: list[VocabularyBindingDecl], terms: list[TargetTermDecl] +) -> list[VocabularyRef]: + seen: dict[str, VocabularyRef] = {} + for binding in bindings: + seen.setdefault(binding.vocabulary.name, binding.vocabulary) + for term in terms: + seen.setdefault(term.vocabulary.name, term.vocabulary) + return sorted(seen.values(), key=lambda v: v.name) + + +def _resolve_cross_refs( + entities: list[TargetEntityDecl], + obligations: list[TargetObligationDecl], + bindings: list[VocabularyBindingDecl], + terms: list[TargetTermDecl], +) -> None: + by_qualified = {e.ref.qualified_name: e for e in entities} + properties_by_entity = 
def synthesize_endpoint_properties(entity: TargetEntityDecl) -> list[TargetPropertyDecl]:
    """Return synthesized endpoint properties for *entity*.

    An entity without ``endpoints`` yields an empty list; otherwise one
    property is built for the ``subject`` endpoint and one for the ``object``
    endpoint, in that order.
    """
    endpoints = entity.endpoints
    if endpoints is None:
        return []
    return [
        _build_endpoint_property(getattr(endpoints, side))
        for side in ("subject", "object")
    ]
def sort_bindings(
    bindings: list[VocabularyBindingDecl],
) -> list[VocabularyBindingDecl]:
    """Return a new list of *bindings* in stable order.

    Ordering key: entity qualified name, then property name, then vocabulary
    name. The input list is left untouched.
    """

    def _ordering(binding: VocabularyBindingDecl) -> tuple[str, str, str]:
        return (
            binding.entity_ref.qualified_name,
            binding.property_name,
            binding.vocabulary.name,
        )

    ordered = list(bindings)
    ordered.sort(key=_ordering)
    return ordered
def validate_foreign_keys(
    obligation: TargetObligationDecl,
    entities_by_qualified_name: dict[str, TargetEntityDecl],
) -> None:
    """Ensure every foreign key on *obligation* points at a known entity.

    Raises:
        DanglingRefError: for the first foreign key whose
            ``referenced_entity`` is not a key of
            ``entities_by_qualified_name``.
    """
    unresolved = next(
        (
            fk
            for fk in obligation.foreign_keys
            if fk.referenced_entity not in entities_by_qualified_name
        ),
        None,
    )
    if unresolved is None:
        return
    raise DanglingRefError(
        f"TargetObligationDecl(target_entity={obligation.target_entity!r}) "
        f"foreign_key references unknown entity {unresolved.referenced_entity!r}"
    )
+ f"declare source=EXTERNAL or supply matching TargetTermDecl" + ) + + +__all__ = [ + "synthesize_endpoint_properties", + "normalize_entity", + "sort_entities", + "sort_obligations", + "sort_bindings", + "sort_terms", + "sort_context_cards", + "validate_obligation_required_fields", + "validate_foreign_keys", + "validate_endpoint_targets", + "validate_vocabulary_bindings", +] diff --git a/src/sema/targets/registry.py b/src/sema/targets/registry.py new file mode 100644 index 0000000..a96de72 --- /dev/null +++ b/src/sema/targets/registry.py @@ -0,0 +1,231 @@ +"""Version-aware target ontology adapter registry.""" + +from __future__ import annotations + +from collections.abc import Callable +from importlib.metadata import entry_points + +from packaging.specifiers import SpecifierSet +from packaging.version import Version + +from sema.targets.base import REQUIRED_METHODS, TargetOntologyAdapter +from sema.targets.exceptions import ( + AmbiguousAdapterError, + NoMatchingAdapterError, + UnknownAdapterError, +) +from sema.targets.registry_utils import check_no_overlap, parse_supported_versions + +_Key = tuple[str, str] + + +class _Registration: + __slots__ = ( + "adapter_id", + "target_model_id", + "supported_versions", + "specifier_set", + "cls", + "wildcard_target_model_id", + ) + + def __init__( + self, + adapter_id: str, + target_model_id: str, + supported_versions: str, + specifier_set: SpecifierSet, + cls: type, + wildcard_target_model_id: bool = False, + ) -> None: + self.adapter_id = adapter_id + self.target_model_id = target_model_id + self.supported_versions = supported_versions + self.specifier_set = specifier_set + self.cls = cls + self.wildcard_target_model_id = wildcard_target_model_id + + +class _Registry: + def __init__(self) -> None: + self._by_key: dict[_Key, list[_Registration]] = {} + + def clear(self) -> None: + self._by_key.clear() + + def register( + self, + adapter_id: str, + target_model_id: str, + supported_versions: str, + cls: type, + 
wildcard_target_model_id: bool = False, + ) -> None: + _ensure_protocol_methods(cls) + spec = parse_supported_versions(supported_versions) + key = (adapter_id, target_model_id) + existing = self._by_key.get(key, []) + check_no_overlap( + supported_versions, + spec, + ((r.supported_versions, r.specifier_set) for r in existing), + adapter_id, + target_model_id, + ) + registration = _Registration( + adapter_id, + target_model_id, + supported_versions, + spec, + cls, + wildcard_target_model_id=wildcard_target_model_id, + ) + self._by_key.setdefault(key, []).append(registration) + + def get( + self, + adapter_id: str, + target_model_id: str, + target_model_version: str | None = None, + ) -> type: + registrations = self._lookup_or_raise(adapter_id, target_model_id) + if target_model_version is None: + return _resolve_versionless(adapter_id, target_model_id, registrations) + return _resolve_with_version( + adapter_id, target_model_id, target_model_version, registrations + ) + + def list_all(self) -> list[tuple[str, str, str]]: + rows = [ + (r.adapter_id, r.target_model_id, r.supported_versions) + for regs in self._by_key.values() + for r in regs + ] + return sorted(rows) + + def discover_entry_points(self, group: str) -> list[type]: + eps = entry_points(group=group) + return [ep.load() for ep in eps] + + def _lookup_or_raise( + self, adapter_id: str, target_model_id: str + ) -> list[_Registration]: + registrations = self._by_key.get((adapter_id, target_model_id)) + if registrations: + return registrations + wildcard = self._wildcard_registrations(adapter_id) + if wildcard: + return wildcard + if not any(k[0] == adapter_id for k in self._by_key): + ids = sorted({k[0] for k in self._by_key}) + raise UnknownAdapterError( + f"unknown adapter_id={adapter_id!r}; registered ids={ids}" + ) + models = sorted({k[1] for k in self._by_key if k[0] == adapter_id}) + raise UnknownAdapterError( + f"unknown target_model_id={target_model_id!r} for adapter_id={adapter_id!r}; " + 
f"registered models={models}" + ) + + def _wildcard_registrations(self, adapter_id: str) -> list[_Registration]: + return [ + r + for regs in self._by_key.values() + for r in regs + if r.adapter_id == adapter_id and r.wildcard_target_model_id + ] + + +def _ensure_protocol_methods(cls: type) -> None: + missing = [m for m in REQUIRED_METHODS if not callable(getattr(cls, m, None))] + if missing: + raise TypeError( + f"{cls.__qualname__} is missing required TargetOntologyAdapter methods: {missing}" + ) + + +def _resolve_versionless( + adapter_id: str, target_model_id: str, registrations: list[_Registration] +) -> type: + if len(registrations) == 1: + return registrations[0].cls + candidates = [r.supported_versions for r in registrations] + raise AmbiguousAdapterError( + f"({adapter_id!r}, {target_model_id!r}) has multiple registrations " + f"{candidates}; specify target_model_version to disambiguate" + ) + + +def _resolve_with_version( + adapter_id: str, + target_model_id: str, + target_model_version: str, + registrations: list[_Registration], +) -> type: + version = Version(target_model_version) + matches = [r for r in registrations if r.specifier_set.contains(version, prereleases=True)] + if len(matches) == 1: + return matches[0].cls + if len(matches) > 1: + candidates = [m.supported_versions for m in matches] + raise AmbiguousAdapterError( + f"version {target_model_version!r} matches multiple registrations " + f"{candidates} for ({adapter_id!r}, {target_model_id!r})" + ) + registered = [r.supported_versions for r in registrations] + raise NoMatchingAdapterError( + f"no registration matches version {target_model_version!r} for " + f"({adapter_id!r}, {target_model_id!r}); registered ranges={registered}" + ) + + +_REGISTRY = _Registry() + + +def register_target_adapter( + *, + adapter_id: str, + target_model_id: str, + supported_versions: str = "", + wildcard_target_model_id: bool = False, +) -> Callable[[type], type]: + def decorator(cls: type) -> type: + 
def parse_supported_versions(supported_versions: str) -> SpecifierSet:
    """Parse a ``supported_versions`` string into a ``SpecifierSet``.

    The literal ``"*"`` is rejected explicitly with a message pointing at the
    empty string as the canonical wildcard form.

    Raises:
        AdapterRegistryError: for ``"*"`` or for any string that
            ``SpecifierSet`` fails to parse (the original error is chained).
    """
    is_star_wildcard = supported_versions == "*"
    if is_star_wildcard:
        raise AdapterRegistryError(
            "supported_versions=='*' is not a PEP 440 specifier; "
            "use the empty string '' as the canonical wildcard form"
        )
    try:
        parsed = SpecifierSet(supported_versions)
    except Exception as exc:
        raise AdapterRegistryError(
            f"supported_versions={supported_versions!r} is not a valid PEP 440 specifier"
        ) from exc
    else:
        return parsed
def _candidate_versions(anchors: set[str]) -> list[Version]:
    """Build the probe versions used to test two specifier sets for overlap.

    Probes are, in order:

    * every nudge of every anchor (see ``_nudges_around``), dropping strings
      that are not valid versions;
    * two floor probes, ``0.dev0`` and ``0``;
    * the between-anchor candidates from ``_between_anchor_candidates``.

    The floor probes cover overlaps that lie entirely below the lowest
    anchor. Without them ``<1`` vs ``<2`` yields no common probe: every
    anchor-derived probe is >= 1 except ``1.dev0``, and PEP 440's exclusive
    ordered comparison keeps ``1.dev0`` out of ``<1``. A probe only counts
    when both sets contain it, so the extra probes cannot cause a false
    overlap report.
    """
    probe_texts: list[str] = [
        nudged for raw in anchors for nudged in _nudges_around(raw)
    ]
    # Representatives of "below every anchor".
    probe_texts.extend(("0.dev0", "0"))
    out: list[Version] = []
    for text in probe_texts:
        try:
            out.append(Version(text))
        except ValueError:
            # packaging's InvalidVersion subclasses ValueError; some nudges of
            # suffixed anchors (e.g. "1.0a1.0.0.0.1") are not valid versions.
            continue
    out.extend(_between_anchor_candidates(anchors))
    return out
Catches strict- + inequality overlaps like `>1,<3` vs `>2,<4` (overlap on (2, 3)).""" + parsed: list[Version] = [] + for raw in anchors: + try: + parsed.append(Version(raw)) + except Exception: + continue + parsed.sort() + out: list[Version] = [] + for low, high in zip(parsed, parsed[1:]): + for between in _integer_candidates_between(low, high): + out.append(between) + return out + + +def _integer_candidates_between(low: Version, high: Version) -> list[Version]: + low_major = low.release[0] if low.release else 0 + high_major = high.release[0] if high.release else 0 + out: list[Version] = [] + for n in range(low_major + 1, high_major + 1): + try: + out.append(Version(f"{n}.0.0.0.1")) + except Exception: + continue + return out + + +def check_no_overlap( + new_str: str, + new_set: SpecifierSet, + existing: Iterable[tuple[str, SpecifierSet]], + adapter_id: str, + target_model_id: str, +) -> None: + for existing_str, existing_set in existing: + if ranges_overlap(new_set, existing_set): + raise OverlappingVersionRangeError( + f"supported_versions={new_str!r} overlaps existing registration " + f"{existing_str!r} for (adapter_id={adapter_id!r}, " + f"target_model_id={target_model_id!r}); supported ranges MUST be disjoint" + ) diff --git a/tests/integration/targets/__init__.py b/tests/integration/targets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/targets/test_target_loader_round_trip.py b/tests/integration/targets/test_target_loader_round_trip.py new file mode 100644 index 0000000..1971a97 --- /dev/null +++ b/tests/integration/targets/test_target_loader_round_trip.py @@ -0,0 +1,372 @@ +"""Neo4j round-trip integration tests for `TargetModelMaterializer`. 
@pytest.fixture
def migrated_neo4j(clean_neo4j):
    """Yield ``clean_neo4j`` with the target-loader migrations applied.

    Runs every ``cypher_up()`` statement before the test and every
    ``cypher_down()`` statement after it, each batch in its own session.
    """

    def _run_all(statements) -> None:
        with clean_neo4j.session() as session:
            for statement in statements:
                session.run(statement)

    _run_all(cypher_up())
    yield clean_neo4j
    _run_all(cypher_down())
def test_two_generations_coexist_with_distinct_snapshot_hashes(
    migrated_neo4j, tmp_path
) -> None:
    """Loading a drifted manifest keeps the golden generation alongside it."""
    # Generation 1: the golden manifest as-is.
    load_target(ManifestTargetAdapter(_GOLDEN), writer=_writer(migrated_neo4j))

    # Generation 2: same manifest with one property type changed.
    manifest = yaml.safe_load(_GOLDEN.read_text())
    first_property = manifest["entities"][0]["properties"][0]
    first_property["type"] = "string"
    drifted_path = tmp_path / "drifted.yaml"
    drifted_path.write_text(yaml.safe_dump(manifest))
    drifted_loaded = load_target(
        ManifestTargetAdapter(drifted_path), writer=_writer(migrated_neo4j)
    )
    drifted_hash = drifted_loaded.target_schema_snapshot_hash
    assert drifted_hash != _GOLDEN_HASH

    query = (
        "MATCH (n:Entity {qualified_name: 'omop.person'}) "
        "RETURN DISTINCT n.target_schema_snapshot_hash AS h"
    )
    with migrated_neo4j.session() as s:
        hashes = {record["h"] for record in list(s.run(query))}
    assert _GOLDEN_HASH in hashes
    assert drifted_hash in hashes
def test_enrichment_status_indexed_query_returns_deferred_entities(
    migrated_neo4j,
) -> None:
    """At least one current entity reports the 'required_deferred' status."""
    load_target(ManifestTargetAdapter(_GOLDEN), writer=_writer(migrated_neo4j))
    query = (
        "MATCH (n:Entity {is_current: true}) "
        "WHERE n.enrichment_vocabulary_bindings_status = 'required_deferred' "
        "RETURN n.qualified_name AS qname"
    )
    with migrated_neo4j.session() as s:
        qnames = [record["qname"] for record in list(s.run(query))]
    assert any(qnames)
def test_target_obligation_and_term_carry_is_current(migrated_neo4j) -> None:
    """A TargetObligation node and a Term node both exist with is_current true."""
    load_target(ManifestTargetAdapter(_GOLDEN), writer=_writer(migrated_neo4j))
    obligation_query = (
        "MATCH (n:TargetObligation) "
        "RETURN n.is_current AS c LIMIT 1"
    )
    term_query = "MATCH (n:Term) RETURN n.is_current AS c LIMIT 1"
    with migrated_neo4j.session() as s:
        obligation_row = s.run(obligation_query).single()
        term_row = s.run(term_query).single()
    # A missing row means no node of that label was written at all.
    assert obligation_row is not None and obligation_row["c"] is True
    assert term_row is not None and term_row["c"] is True
def test_card_only_bump_has_context_card_relationship_resolves_current(
    migrated_neo4j, tmp_path
) -> None:
    """After a card-only bump, omop.person links to both card versions."""
    load_target(ManifestTargetAdapter(_GOLDEN), writer=_writer(migrated_neo4j))

    # Rewrite only the context card of omop.person; the schema is unchanged.
    manifest = yaml.safe_load(_GOLDEN.read_text())
    person_entry = next(
        entry
        for entry in manifest["entities"]
        if entry["qualified_name"] == "omop.person"
    )
    card = person_entry["context_card"]
    card["card_version"] = "2.0.0"
    card["description"] = "Bumped wording"
    bumped_path = tmp_path / "bumped.yaml"
    bumped_path.write_text(yaml.safe_dump(manifest))
    load_target(ManifestTargetAdapter(bumped_path), writer=_writer(migrated_neo4j))

    query = (
        "MATCH (e:Entity {qualified_name: 'omop.person'})"
        "-[r:HAS_CONTEXT_CARD]->(c:ContextCard) "
        "RETURN c.card_version AS version"
    )
    with migrated_neo4j.session() as s:
        versions = {record["version"] for record in list(s.run(query))}
    assert versions == {"1.0.0", "2.0.0"}
0000000..1241c62 --- /dev/null +++ b/tests/unit/models/target/test_completeness.py @@ -0,0 +1,57 @@ +"""Tests for SemanticCompleteness enum and SemanticCompletenessAnnotations.""" + +from __future__ import annotations + +import pytest +from pydantic import ValidationError + +from sema.models.target.completeness import ( + SemanticCompleteness, + SemanticCompletenessAnnotations, +) + + +pytestmark = pytest.mark.unit + + +def test_enum_values_exact_set() -> None: + assert {m.value for m in SemanticCompleteness} == { + "COMPLETE", + "PARTIAL", + "NONE", + "EXTERNAL", + } + + +def test_annotations_requires_all_five_facets() -> None: + with pytest.raises(ValidationError) as exc: + SemanticCompletenessAnnotations( # type: ignore[call-arg] + structure=SemanticCompleteness.COMPLETE, + ) + msg = str(exc.value) + for facet in ("obligations", "vocabulary_bindings", "semantic_aliases", "terms"): + assert facet in msg + + +def test_annotations_extra_field_rejected() -> None: + with pytest.raises(ValidationError): + SemanticCompletenessAnnotations( # type: ignore[call-arg] + structure=SemanticCompleteness.COMPLETE, + obligations=SemanticCompleteness.COMPLETE, + vocabulary_bindings=SemanticCompleteness.PARTIAL, + semantic_aliases=SemanticCompleteness.PARTIAL, + terms=SemanticCompleteness.EXTERNAL, + extra="boom", + ) + + +def test_annotations_frozen() -> None: + ann = SemanticCompletenessAnnotations( + structure=SemanticCompleteness.COMPLETE, + obligations=SemanticCompleteness.COMPLETE, + vocabulary_bindings=SemanticCompleteness.PARTIAL, + semantic_aliases=SemanticCompleteness.PARTIAL, + terms=SemanticCompleteness.EXTERNAL, + ) + with pytest.raises(ValidationError): + ann.structure = SemanticCompleteness.NONE # type: ignore[misc] diff --git a/tests/unit/models/target/test_context_card.py b/tests/unit/models/target/test_context_card.py new file mode 100644 index 0000000..4264e03 --- /dev/null +++ b/tests/unit/models/target/test_context_card.py @@ -0,0 +1,93 @@ +"""Tests for 
TargetContextCard.""" + +from __future__ import annotations + +import pytest +from packaging.version import Version +from pydantic import ValidationError + +from sema.models.planner._enums import TargetArtifactKind +from sema.models.target.context_card import TargetContextCard +from sema.models.target.refs import TargetEntityRef + + +pytestmark = pytest.mark.unit + + +def _ref() -> TargetEntityRef: + return TargetEntityRef( + target_model_id="omop-cdm", + qualified_name="omop.person", + kind=TargetArtifactKind.TABLE_ROW, + ) + + +def _card(**kwargs: object) -> TargetContextCard: + base: dict[str, object] = { + "entity_ref": _ref(), + "card_version": "1.0.0", + "description": "OMOP person table", + "examples": ["A patient record with demographic columns."], + } + base.update(kwargs) + return TargetContextCard(**base) # type: ignore[arg-type] + + +def test_card_round_trip() -> None: + card = _card(curated_synonyms=["patient", "subject"]) + assert TargetContextCard.model_validate_json(card.model_dump_json()) == card + + +def test_card_default_card_hash_is_none() -> None: + card = _card() + assert card.card_hash is None + + +def test_card_rejects_non_none_card_hash_at_construction() -> None: + with pytest.raises(ValidationError): + _card(card_hash="deadbeef") + + +def test_card_rejects_64_char_hex_card_hash_at_construction() -> None: + """Adapter MUST NOT supply a card_hash, even if shaped as a 64-char + hex SHA-256 digest. 
Hash computation is Sema-owned via the loader.""" + with pytest.raises(ValidationError): + _card(card_hash="0" * 64) + with pytest.raises(ValidationError): + _card(card_hash="abcdef0123456789" * 4) + + +def test_card_description_must_be_non_empty() -> None: + with pytest.raises(ValidationError): + _card(description="") + + +def test_card_description_max_4000_chars() -> None: + with pytest.raises(ValidationError): + _card(description="x" * 4001) + + +@pytest.mark.parametrize( + "version", + ["1.0.0", "2.1.3b1", "0.0.0+synthesized", "1.0.0.post1"], +) +def test_card_version_accepts_pep440(version: str) -> None: + card = _card(card_version=version) + assert Version(card.card_version) == Version(version) + + +@pytest.mark.parametrize("bad_version", ["not-a-version", "0.0.0-synthesized", ""]) +def test_card_version_rejects_non_pep440(bad_version: str) -> None: + with pytest.raises(ValidationError): + _card(card_version=bad_version) + + +def test_card_extra_field_rejected() -> None: + with pytest.raises(ValidationError): + _card(extra=1) + + +def test_card_frozen() -> None: + card = _card() + with pytest.raises(ValidationError): + card.description = "x" # type: ignore[misc] diff --git a/tests/unit/models/target/test_descriptor.py b/tests/unit/models/target/test_descriptor.py new file mode 100644 index 0000000..e5d8595 --- /dev/null +++ b/tests/unit/models/target/test_descriptor.py @@ -0,0 +1,69 @@ +"""Tests for TargetModelDescriptor.""" + +from __future__ import annotations + +import pytest +from pydantic import ValidationError + +from sema.models.target.completeness import ( + SemanticCompleteness, + SemanticCompletenessAnnotations, +) +from sema.models.target.descriptor import TargetModelDescriptor + + +pytestmark = pytest.mark.unit + + +def _annotations() -> SemanticCompletenessAnnotations: + return SemanticCompletenessAnnotations( + structure=SemanticCompleteness.COMPLETE, + obligations=SemanticCompleteness.COMPLETE, + 
vocabulary_bindings=SemanticCompleteness.COMPLETE, + semantic_aliases=SemanticCompleteness.PARTIAL, + terms=SemanticCompleteness.EXTERNAL, + ) + + +def test_descriptor_round_trip() -> None: + desc = TargetModelDescriptor( + target_model_id="omop-cdm", + target_model_version="5.4.0", + display_name="OMOP CDM", + owner="ohdsi", + vocabulary_release="2025-01", + completeness=_annotations(), + ) + assert TargetModelDescriptor.model_validate_json(desc.model_dump_json()) == desc + + +@pytest.mark.parametrize( + "bad_id", + [ + "OMOP-CDM", + "omop_cdm", + "1omop", + "-omop", + "omop cdm", + "omop.cdm", + ], +) +def test_descriptor_rejects_non_kebab_target_model_id(bad_id: str) -> None: + with pytest.raises(ValidationError): + TargetModelDescriptor( + target_model_id=bad_id, + target_model_version="1.0.0", + display_name="x", + completeness=_annotations(), + ) + + +def test_descriptor_extra_field_rejected() -> None: + with pytest.raises(ValidationError): + TargetModelDescriptor( # type: ignore[call-arg] + target_model_id="omop-cdm", + target_model_version="5.4.0", + display_name="OMOP CDM", + completeness=_annotations(), + extra="oops", + ) diff --git a/tests/unit/models/target/test_dto_meta.py b/tests/unit/models/target/test_dto_meta.py new file mode 100644 index 0000000..f64321e --- /dev/null +++ b/tests/unit/models/target/test_dto_meta.py @@ -0,0 +1,41 @@ +"""Meta-test: every target DTO is `extra='forbid'` + `frozen=True`.""" + +from __future__ import annotations + +import importlib +import inspect +import pkgutil + +import pytest +from pydantic import BaseModel + +import sema.models.target as target_pkg + + +pytestmark = pytest.mark.unit + + +def _iter_dto_classes() -> list[type[BaseModel]]: + classes: list[type[BaseModel]] = [] + for mod_info in pkgutil.iter_modules(target_pkg.__path__): + mod = importlib.import_module(f"{target_pkg.__name__}.{mod_info.name}") + for _, obj in inspect.getmembers(mod, inspect.isclass): + if obj.__module__ != mod.__name__: + continue + 
if not issubclass(obj, BaseModel): + continue + classes.append(obj) + return classes + + +def test_every_target_dto_has_extra_forbid_and_frozen() -> None: + classes = _iter_dto_classes() + assert classes, "expected at least one DTO under sema.models.target" + failures: list[str] = [] + for cls in classes: + config = cls.model_config + if config.get("extra") != "forbid": + failures.append(f"{cls.__module__}.{cls.__name__}: extra != forbid") + if not config.get("frozen"): + failures.append(f"{cls.__module__}.{cls.__name__}: frozen != True") + assert not failures, "\n".join(failures) diff --git a/tests/unit/models/target/test_endpoints.py b/tests/unit/models/target/test_endpoints.py new file mode 100644 index 0000000..83c1f5a --- /dev/null +++ b/tests/unit/models/target/test_endpoints.py @@ -0,0 +1,31 @@ +"""Tests for EdgeEndpointDecl and EdgeEndpointsDecl.""" + +from __future__ import annotations + +import pytest + +from sema.models.planner._enums import TargetArtifactKind +from sema.models.target.endpoints import EdgeEndpointDecl, EdgeEndpointsDecl +from sema.models.target.refs import TargetEntityRef + + +pytestmark = pytest.mark.unit + + +def _ref(name: str, kind: TargetArtifactKind = TargetArtifactKind.GRAPH_NODE) -> TargetEntityRef: + return TargetEntityRef(target_model_id="acris", qualified_name=f"acris.{name}", kind=kind) + + +def test_endpoint_decl_defaults() -> None: + e = EdgeEndpointDecl(role="subject", target_entity=_ref("LLC")) + assert e.cardinality == "one" + assert e.nullable is False + + +def test_endpoints_decl_round_trip() -> None: + eds = EdgeEndpointsDecl( + subject=EdgeEndpointDecl(role="subject", target_entity=_ref("LLC")), + object=EdgeEndpointDecl(role="object", target_entity=_ref("Property")), + ) + blob = eds.model_dump_json() + assert EdgeEndpointsDecl.model_validate_json(blob) == eds diff --git a/tests/unit/models/target/test_entity_decl.py b/tests/unit/models/target/test_entity_decl.py new file mode 100644 index 0000000..0841c9b --- 
/dev/null +++ b/tests/unit/models/target/test_entity_decl.py @@ -0,0 +1,79 @@ +"""Tests for TargetEntityDecl + endpoints/kind invariants.""" + +from __future__ import annotations + +import pytest +from pydantic import ValidationError + +from sema.models.planner._enums import TargetArtifactKind +from sema.models.target.endpoints import EdgeEndpointDecl, EdgeEndpointsDecl +from sema.models.target.entity import TargetEntityDecl +from sema.models.target.properties import TargetPropertyDecl +from sema.models.target.refs import TargetEntityRef + + +pytestmark = pytest.mark.unit + + +def _ref(name: str, kind: TargetArtifactKind) -> TargetEntityRef: + return TargetEntityRef(target_model_id="acris", qualified_name=name, kind=kind) + + +def _columnar_property(name: str = "id") -> TargetPropertyDecl: + return TargetPropertyDecl(name=name, type="int", nullable=False) + + +def _endpoints() -> EdgeEndpointsDecl: + return EdgeEndpointsDecl( + subject=EdgeEndpointDecl( + role="subject", + target_entity=_ref("acris.LLC", TargetArtifactKind.GRAPH_NODE), + ), + object=EdgeEndpointDecl( + role="object", + target_entity=_ref("acris.Property", TargetArtifactKind.GRAPH_NODE), + ), + ) + + +def test_table_row_entity_no_endpoints() -> None: + e = TargetEntityDecl( + ref=_ref("omop.person", TargetArtifactKind.TABLE_ROW), + properties=[_columnar_property()], + ) + assert e.endpoints is None + + +def test_graph_edge_entity_requires_endpoints() -> None: + with pytest.raises(ValidationError): + TargetEntityDecl( + ref=_ref("acris.OWNS", TargetArtifactKind.GRAPH_EDGE), + properties=[_columnar_property("valid_from")], + ) + + +def test_table_row_entity_rejects_endpoints() -> None: + with pytest.raises(ValidationError): + TargetEntityDecl( + ref=_ref("omop.person", TargetArtifactKind.TABLE_ROW), + properties=[_columnar_property()], + endpoints=_endpoints(), + ) + + +def test_graph_node_entity_rejects_endpoints() -> None: + with pytest.raises(ValidationError): + TargetEntityDecl( + 
ref=_ref("acris.LLC", TargetArtifactKind.GRAPH_NODE), + properties=[_columnar_property()], + endpoints=_endpoints(), + ) + + +def test_graph_edge_entity_round_trip() -> None: + e = TargetEntityDecl( + ref=_ref("acris.OWNS", TargetArtifactKind.GRAPH_EDGE), + properties=[_columnar_property("valid_from")], + endpoints=_endpoints(), + ) + assert TargetEntityDecl.model_validate_json(e.model_dump_json()) == e diff --git a/tests/unit/models/target/test_obligation_reexport.py b/tests/unit/models/target/test_obligation_reexport.py new file mode 100644 index 0000000..e8a3b41 --- /dev/null +++ b/tests/unit/models/target/test_obligation_reexport.py @@ -0,0 +1,15 @@ +"""TargetObligationDecl is the same class as planner.TargetObligation.""" + +from __future__ import annotations + +import pytest + +from sema.models.planner.target_model import TargetObligation +from sema.models.target.obligation import TargetObligationDecl + + +pytestmark = pytest.mark.unit + + +def test_target_obligation_decl_is_same_class() -> None: + assert TargetObligationDecl is TargetObligation diff --git a/tests/unit/models/target/test_property_decl.py b/tests/unit/models/target/test_property_decl.py new file mode 100644 index 0000000..f90637b --- /dev/null +++ b/tests/unit/models/target/test_property_decl.py @@ -0,0 +1,68 @@ +"""Tests for TargetPropertyDecl + endpoint property fields.""" + +from __future__ import annotations + +import pytest +from pydantic import ValidationError + +from sema.models.target.properties import PropertyKind, TargetPropertyDecl + + +pytestmark = pytest.mark.unit + + +def test_columnar_property_minimum_fields() -> None: + p = TargetPropertyDecl(name="person_id", type="int", nullable=False) + assert p.property_kind is PropertyKind.COLUMN + assert p.endpoint_role is None + assert p.endpoint_target_entity_qualified_name is None + assert p.materialized_as_edge_property is True + + +def test_columnar_property_round_trip() -> None: + p = TargetPropertyDecl( + name="gender_concept_id", 
+ type="int", + nullable=False, + synonyms=["gender", "sex"], + decoded_values={"8507": "MALE", "8532": "FEMALE"}, + ) + assert TargetPropertyDecl.model_validate_json(p.model_dump_json()) == p + + +@pytest.mark.parametrize("reserved", ["subject", "object"]) +def test_columnar_property_rejects_reserved_endpoint_names(reserved: str) -> None: + with pytest.raises(ValidationError): + TargetPropertyDecl(name=reserved, type="string", nullable=False) + + +def test_endpoint_kind_permits_reserved_names_for_normalizer_use() -> None: + p = TargetPropertyDecl( + name="subject", + type="entity_ref", + nullable=False, + property_kind=PropertyKind.ENDPOINT, + endpoint_role="subject", + endpoint_target_entity_qualified_name="acris.LLC", + endpoint_cardinality="one", + endpoint_nullable=False, + materialized_as_edge_property=False, + ) + assert p.property_kind is PropertyKind.ENDPOINT + assert p.endpoint_role == "subject" + + +def test_property_extra_field_rejected() -> None: + with pytest.raises(ValidationError): + TargetPropertyDecl( # type: ignore[call-arg] + name="x", + type="int", + nullable=True, + wat=1, + ) + + +def test_property_frozen() -> None: + p = TargetPropertyDecl(name="x", type="int", nullable=False) + with pytest.raises(ValidationError): + p.name = "y" # type: ignore[misc] diff --git a/tests/unit/models/target/test_refs.py b/tests/unit/models/target/test_refs.py new file mode 100644 index 0000000..f918fbe --- /dev/null +++ b/tests/unit/models/target/test_refs.py @@ -0,0 +1,62 @@ +"""Tests for target-side refs (TargetEntityRef, TargetPropertyRef, VocabularyRef).""" + +from __future__ import annotations + +import pytest +from pydantic import ValidationError + +from sema.models.planner._enums import TargetArtifactKind +from sema.models.target.refs import ( + TargetEntityRef, + TargetPropertyRef, + VocabularyRef, + VocabularySource, +) + + +pytestmark = pytest.mark.unit + + +def _entity_ref() -> TargetEntityRef: + return TargetEntityRef( + target_model_id="omop-cdm", 
+ qualified_name="omop.person", + kind=TargetArtifactKind.TABLE_ROW, + ) + + +def test_entity_ref_round_trip() -> None: + ref = _entity_ref() + blob = ref.model_dump_json() + assert TargetEntityRef.model_validate_json(blob) == ref + + +def test_entity_ref_qualified_name_must_be_dotted() -> None: + with pytest.raises(ValidationError): + TargetEntityRef( + target_model_id="omop-cdm", + qualified_name="person", + kind=TargetArtifactKind.TABLE_ROW, + ) + + +def test_property_ref_extra_field_rejected() -> None: + with pytest.raises(ValidationError): + TargetPropertyRef( # type: ignore[call-arg] + entity_ref=_entity_ref(), + property_name="person_id", + extra=1, + ) + + +def test_property_ref_frozen() -> None: + pr = TargetPropertyRef(entity_ref=_entity_ref(), property_name="person_id") + with pytest.raises(ValidationError): + pr.property_name = "x" # type: ignore[misc] + + +def test_vocabulary_ref_source_enum() -> None: + ext = VocabularyRef(name="SNOMED", source=VocabularySource.EXTERNAL) + inline = VocabularyRef(name="local-list", source=VocabularySource.INLINE) + assert ext.source.value == "EXTERNAL" + assert inline.source.value == "INLINE" diff --git a/tests/unit/models/target/test_term_decl.py b/tests/unit/models/target/test_term_decl.py new file mode 100644 index 0000000..e264bb0 --- /dev/null +++ b/tests/unit/models/target/test_term_decl.py @@ -0,0 +1,30 @@ +"""Tests for TargetTermDecl.""" + +from __future__ import annotations + +import pytest + +from sema.models.target.refs import VocabularyRef, VocabularySource +from sema.models.target.term import TargetTermDecl + + +pytestmark = pytest.mark.unit + + +def test_term_round_trip() -> None: + t = TargetTermDecl( + vocabulary=VocabularyRef(name="local-list", source=VocabularySource.INLINE), + code="MALE", + display="Male", + domain="Gender", + ) + assert TargetTermDecl.model_validate_json(t.model_dump_json()) == t + + +def test_term_minimum_fields() -> None: + t = TargetTermDecl( + 
vocabulary=VocabularyRef(name="local-list", source=VocabularySource.INLINE), + code="MALE", + display="Male", + ) + assert t.domain is None diff --git a/tests/unit/models/target/test_vocab_binding.py b/tests/unit/models/target/test_vocab_binding.py new file mode 100644 index 0000000..38c012b --- /dev/null +++ b/tests/unit/models/target/test_vocab_binding.py @@ -0,0 +1,56 @@ +"""Tests for VocabularyBindingDecl.""" + +from __future__ import annotations + +import pytest +from pydantic import ValidationError + +from sema.models.planner._enums import TargetArtifactKind +from sema.models.target.refs import TargetEntityRef, VocabularyRef, VocabularySource +from sema.models.target.vocab_binding import VocabularyBindingDecl + + +pytestmark = pytest.mark.unit + + +def _entity_ref() -> TargetEntityRef: + return TargetEntityRef( + target_model_id="omop-cdm", + qualified_name="omop.person", + kind=TargetArtifactKind.TABLE_ROW, + ) + + +def test_minimal_binding() -> None: + vb = VocabularyBindingDecl( + entity_ref=_entity_ref(), + property_name="gender_concept_id", + vocabulary=VocabularyRef(name="SNOMED", source=VocabularySource.EXTERNAL), + ) + assert vb.require_standard is False + assert vb.allow_zero_default is False + assert vb.domain is None + + +def test_binding_round_trip_with_all_hooks() -> None: + vb = VocabularyBindingDecl( + entity_ref=_entity_ref(), + property_name="gender_concept_id", + vocabulary=VocabularyRef(name="SNOMED", source=VocabularySource.EXTERNAL), + domain="Gender", + require_standard=True, + allow_zero_default=True, + effective_date_ref="omop.person.start_date", + resolver_policy_ref="omop.policy.standard_only", + ) + assert VocabularyBindingDecl.model_validate_json(vb.model_dump_json()) == vb + + +def test_binding_extra_field_rejected() -> None: + with pytest.raises(ValidationError): + VocabularyBindingDecl( # type: ignore[call-arg] + entity_ref=_entity_ref(), + property_name="x", + vocabulary=VocabularyRef(name="SNOMED", 
source=VocabularySource.EXTERNAL), + mystery=1, + ) diff --git a/tests/unit/targets/__init__.py b/tests/unit/targets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/targets/conftest.py b/tests/unit/targets/conftest.py new file mode 100644 index 0000000..1c0fba4 --- /dev/null +++ b/tests/unit/targets/conftest.py @@ -0,0 +1,280 @@ +"""Shared fixtures for `sema.targets` unit tests.""" + +from __future__ import annotations + +from collections.abc import Iterable, Iterator + +import pytest + +from sema.models.planner._enums import ( + PrimaryKeyStrategy, + TargetArtifactKind, +) +from sema.models.target.completeness import ( + SemanticCompleteness, + SemanticCompletenessAnnotations, +) +from sema.models.target.context_card import TargetContextCard +from sema.models.target.descriptor import TargetModelDescriptor +from sema.models.target.endpoints import EdgeEndpointDecl, EdgeEndpointsDecl +from sema.models.target.entity import TargetEntityDecl +from sema.models.target.obligation import TargetObligationDecl +from sema.models.target.properties import TargetPropertyDecl +from sema.models.target.refs import ( + TargetEntityRef, + TargetPropertyRef, + VocabularyRef, + VocabularySource, +) +from sema.models.target.term import TargetTermDecl +from sema.models.target.vocab_binding import VocabularyBindingDecl +from sema.targets import register_target_adapter +from sema.targets.registry import _clear_for_tests + + +def _make_descriptor( + target_model_id: str = "fake-target", + completeness: SemanticCompletenessAnnotations | None = None, +) -> TargetModelDescriptor: + return TargetModelDescriptor( + target_model_id=target_model_id, + target_model_version="1.0.0", + display_name="Fake Target", + completeness=completeness or default_completeness(), + ) + + +def default_completeness() -> SemanticCompletenessAnnotations: + return SemanticCompletenessAnnotations( + structure=SemanticCompleteness.COMPLETE, + obligations=SemanticCompleteness.COMPLETE, + 
vocabulary_bindings=SemanticCompleteness.PARTIAL, + semantic_aliases=SemanticCompleteness.PARTIAL, + terms=SemanticCompleteness.EXTERNAL, + ) + + +def _make_entity_ref( + target_model_id: str = "fake-target", + qualified_name: str = "fake.person", + kind: TargetArtifactKind = TargetArtifactKind.TABLE_ROW, +) -> TargetEntityRef: + return TargetEntityRef( + target_model_id=target_model_id, qualified_name=qualified_name, kind=kind + ) + + +class FakeAdapter: + """Minimal conforming adapter used across protocol/registry tests.""" + + def __init__(self, target_model_id: str = "fake-target") -> None: + self._descriptor = _make_descriptor(target_model_id) + self._entity_ref = _make_entity_ref(target_model_id=target_model_id) + self._entity_decl = TargetEntityDecl( + ref=self._entity_ref, + properties=[TargetPropertyDecl(name="person_id", type="string", nullable=False)], + ) + self._obligation = TargetObligationDecl( + target_entity=self._entity_ref.qualified_name, + required_fields=["person_id"], + primary_key=PrimaryKeyStrategy.NATURAL_KEY, + ) + + def describe(self) -> TargetModelDescriptor: + return self._descriptor + + def discover_entities(self) -> Iterable[TargetEntityRef]: + return [self._entity_ref] + + def load_entity(self, ref: TargetEntityRef) -> TargetEntityDecl: + return self._entity_decl + + def load_obligation(self, ref: TargetEntityRef) -> TargetObligationDecl: + return self._obligation + + def load_vocabulary_bindings( + self, ref: TargetPropertyRef + ) -> Iterable[VocabularyBindingDecl]: + return [] + + def load_context_card(self, ref: TargetEntityRef) -> TargetContextCard: + return TargetContextCard( + entity_ref=ref, + card_version="1.0.0", + description=f"Fake card for {ref.qualified_name}", + ) + + def iter_terms(self, vocabulary_ref: VocabularyRef) -> Iterator[TargetTermDecl]: + raise NotImplementedError("EXTERNAL terms; adapter does not inline") + + +class ScriptedAdapter: + """Adapter built from explicit DTO collections, for normalizer/loader 
tests.""" + + def __init__( + self, + descriptor: TargetModelDescriptor, + entities: list[TargetEntityDecl], + obligations: list[TargetObligationDecl], + bindings: list[VocabularyBindingDecl] | None = None, + terms: list[TargetTermDecl] | None = None, + cards: list[TargetContextCard] | None = None, + ) -> None: + self._descriptor = descriptor + self._entities = list(entities) + self._entities_by_ref = {e.ref: e for e in entities} + self._obligations = {o.target_entity: o for o in obligations} + self._bindings = list(bindings or []) + self._terms = list(terms or []) + self._cards = list(cards or []) + self._cards_by_ref = {c.entity_ref: c for c in self._cards} + + def describe(self) -> TargetModelDescriptor: + return self._descriptor + + def discover_entities(self) -> Iterable[TargetEntityRef]: + return [e.ref for e in self._entities] + + def load_entity(self, ref: TargetEntityRef) -> TargetEntityDecl: + return self._entities_by_ref[ref] + + def load_obligation(self, ref: TargetEntityRef) -> TargetObligationDecl: + return self._obligations[ref.qualified_name] + + def load_vocabulary_bindings( + self, ref: TargetPropertyRef + ) -> Iterable[VocabularyBindingDecl]: + return [ + b + for b in self._bindings + if b.entity_ref == ref.entity_ref and b.property_name == ref.property_name + ] + + def load_context_card(self, ref: TargetEntityRef) -> TargetContextCard: + if ref in self._cards_by_ref: + return self._cards_by_ref[ref] + return TargetContextCard( + entity_ref=ref, + card_version="0.0.0+synthesized", + description=f"Auto-generated card for {ref.qualified_name}.", + ) + + def iter_terms(self, vocabulary_ref: VocabularyRef) -> Iterator[TargetTermDecl]: + matching = [t for t in self._terms if t.vocabulary.name == vocabulary_ref.name] + if not matching: + raise NotImplementedError( + f"adapter does not inline terms for vocabulary {vocabulary_ref.name!r}" + ) + return iter(matching) + + +def make_table_row_entity( + target_model_id: str = "fake-target", + qualified_name: 
str = "fake.person", + properties: list[TargetPropertyDecl] | None = None, + completeness: SemanticCompletenessAnnotations | None = None, +) -> TargetEntityDecl: + ref = _make_entity_ref( + target_model_id=target_model_id, + qualified_name=qualified_name, + kind=TargetArtifactKind.TABLE_ROW, + ) + return TargetEntityDecl( + ref=ref, + properties=properties or [TargetPropertyDecl(name="person_id", type="string", nullable=False)], + completeness=completeness, + ) + + +def make_graph_node_entity( + target_model_id: str = "fake-target", + qualified_name: str = "fake.LLC", + properties: list[TargetPropertyDecl] | None = None, +) -> TargetEntityDecl: + ref = _make_entity_ref( + target_model_id=target_model_id, + qualified_name=qualified_name, + kind=TargetArtifactKind.GRAPH_NODE, + ) + return TargetEntityDecl( + ref=ref, + properties=properties or [TargetPropertyDecl(name="name", type="string", nullable=False)], + ) + + +def make_graph_edge_entity( + target_model_id: str, + qualified_name: str, + subject_target: TargetEntityRef, + object_target: TargetEntityRef, + columnar_properties: list[TargetPropertyDecl] | None = None, +) -> TargetEntityDecl: + ref = _make_entity_ref( + target_model_id=target_model_id, + qualified_name=qualified_name, + kind=TargetArtifactKind.GRAPH_EDGE, + ) + endpoints = EdgeEndpointsDecl( + subject=EdgeEndpointDecl(role="subject", target_entity=subject_target), + object=EdgeEndpointDecl(role="object", target_entity=object_target), + ) + return TargetEntityDecl( + ref=ref, + properties=columnar_properties or [], + endpoints=endpoints, + ) + + +@pytest.fixture +def fake_adapter_cls() -> type[FakeAdapter]: + return FakeAdapter + + +@pytest.fixture(autouse=True) +def _isolate_registry() -> Iterator[None]: + _clear_for_tests() + yield + _clear_for_tests() + + +def make_descriptor( + target_model_id: str = "fake-target", + completeness: SemanticCompletenessAnnotations | None = None, +) -> TargetModelDescriptor: + return 
_make_descriptor(target_model_id, completeness=completeness) + + +def make_entity_ref( + target_model_id: str = "fake-target", + qualified_name: str = "fake.person", + kind: TargetArtifactKind = TargetArtifactKind.TABLE_ROW, +) -> TargetEntityRef: + return _make_entity_ref(target_model_id, qualified_name, kind) + + +def make_obligation( + target_entity: str = "fake.person", + required_fields: list[str] | None = None, + pk: PrimaryKeyStrategy = PrimaryKeyStrategy.NATURAL_KEY, +) -> TargetObligationDecl: + return TargetObligationDecl( + target_entity=target_entity, + required_fields=required_fields or ["person_id"], + primary_key=pk, + ) + + +__all__ = [ + "FakeAdapter", + "ScriptedAdapter", + "make_descriptor", + "make_entity_ref", + "make_obligation", + "make_table_row_entity", + "make_graph_node_entity", + "make_graph_edge_entity", + "default_completeness", + "register_target_adapter", + "VocabularyRef", + "VocabularySource", +] diff --git a/tests/unit/targets/fixtures/golden_manifest.yaml b/tests/unit/targets/fixtures/golden_manifest.yaml new file mode 100644 index 0000000..05aebb0 --- /dev/null +++ b/tests/unit/targets/fixtures/golden_manifest.yaml @@ -0,0 +1,162 @@ +manifest_version: 1 +descriptor: + target_model_id: golden-target + target_model_version: "1.0.0" + display_name: Golden Compliance Target + owner: ontology-team@example.com + vocabulary_release: golden-vocab-2026-q1 +vocabularies: + - name: GENDER_CV + source: INLINE + - name: SNOMED + source: EXTERNAL +terms: + - vocabulary: GENDER_CV + code: M + display: Male + - vocabulary: GENDER_CV + code: F + display: Female +entities: + - qualified_name: omop.person + kind: TABLE_ROW + properties: + - name: person_id + type: integer + nullable: false + - name: gender_concept_id + type: integer + nullable: false + synonyms: [sex, gender_identity] + vocabulary_binding: + vocabulary: GENDER_CV + domain: Gender + require_standard: true + obligation: + required_fields: [person_id, gender_concept_id] + 
primary_key: NATURAL_KEY + domain_constraints: + - property_name: gender_concept_id + domain_id: Gender + context_card: + card_version: "1.0.0" + description: OMOP person table; one row per individual. + examples: ["Sample person row"] + obligation_summary: Required PK plus gender concept. + - qualified_name: omop.observation + kind: TABLE_ROW + properties: + - name: observation_id + type: integer + nullable: false + - name: person_id + type: integer + nullable: false + - name: source_value + type: string + nullable: true + vocabulary_binding: + vocabulary: SNOMED + domain: Observation + obligation: + required_fields: [observation_id, person_id] + primary_key: NATURAL_KEY + foreign_keys: + - referenced_entity: omop.person + join_keys: [["person_id", "person_id"]] + context_card: + card_version: "1.0.0" + description: OMOP observation table; one row per observation event. + - qualified_name: omop.measurement + kind: TABLE_ROW + properties: + - name: measurement_id + type: integer + nullable: false + - name: person_id + type: integer + nullable: false + - name: value_as_number + type: double + nullable: true + obligation: + required_fields: [measurement_id, person_id] + primary_key: NATURAL_KEY + foreign_keys: + - referenced_entity: omop.person + join_keys: [["person_id", "person_id"]] + - qualified_name: acris.LLC + kind: GRAPH_NODE + properties: + - name: name + type: string + nullable: false + obligation: + required_fields: [name] + primary_key: NATURAL_KEY + - qualified_name: acris.Property + kind: GRAPH_NODE + properties: + - name: address + type: string + nullable: false + obligation: + required_fields: [address] + primary_key: NATURAL_KEY + - qualified_name: match.Identity + kind: GRAPH_NODE + properties: + - name: name + type: string + nullable: false + obligation: + required_fields: [name] + primary_key: NATURAL_KEY + - qualified_name: acris.OWNS + kind: GRAPH_EDGE + endpoints: + subject: + target_entity: acris.LLC + cardinality: one + nullable: false + 
object: + target_entity: acris.Property + cardinality: one + nullable: false + properties: + - name: valid_from + type: date + nullable: false + - name: valid_to + type: date + nullable: true + obligation: + required_fields: [subject, object, valid_from] + primary_key: NATURAL_KEY + minimum_viable_row: + op: AND + clauses: + - kind: presence + field: subject + - kind: presence + field: object + - kind: presence + field: valid_from + context_card: + card_version: "1.0.0" + description: Ownership edge between LLC and Property with temporal validity. + - qualified_name: match.SAME_AS + kind: GRAPH_EDGE + endpoints: + subject: + target_entity: match.Identity + cardinality: one + nullable: false + object: + target_entity: match.Identity + cardinality: one + nullable: false + properties: [] + obligation: + required_fields: [subject, object] + primary_key: NATURAL_KEY diff --git a/tests/unit/targets/fixtures/golden_manifest_hash.txt b/tests/unit/targets/fixtures/golden_manifest_hash.txt new file mode 100644 index 0000000..f2e3c95 --- /dev/null +++ b/tests/unit/targets/fixtures/golden_manifest_hash.txt @@ -0,0 +1 @@ +2f0e4a92d3cbf3213eeb892e45c7abfc67ce4bb896933fa7e7d8ea76f0e0c445 diff --git a/tests/unit/targets/test_cross_capability.py b/tests/unit/targets/test_cross_capability.py new file mode 100644 index 0000000..fd0d50a --- /dev/null +++ b/tests/unit/targets/test_cross_capability.py @@ -0,0 +1,172 @@ +"""Cross-capability integration tests using InMemoryGraphWriter. + +Mirrors the unit-testable subset of Section 8: provenance integration, +pin invalidation, lazy subset, multi-adapter, supplied vs deferred, +operator skip-facets. 
+""" + +from __future__ import annotations + +from datetime import datetime, timezone +from pathlib import Path + +import pytest +import yaml + +from sema.models.planner.lifecycle import HumanPin, PinState +from sema.models.planner.lifecycle_utils import detect_pin_stale +from sema.models.planner.provenance import RunProvenance, SourceScope +from sema.models.target.enrichment import EnrichmentStatus, Facet +from sema.targets.adapters.manifest import ManifestTargetAdapter +from sema.targets.loader import load_target +from sema.targets.materializer import InMemoryGraphWriter + +pytestmark = pytest.mark.unit + +_FIXTURES = Path(__file__).parent / "fixtures" +GOLDEN = _FIXTURES / "golden_manifest.yaml" + + +def _build_run(snapshot_hash: str, card_version: str = "1.0.0") -> RunProvenance: + return RunProvenance( + run_id="run-1", + target_model_version="1.0.0", + target_schema_snapshot_hash=snapshot_hash, + context_card_version=card_version, + prompt_template_version="prompt-v1", + few_shot_set_version="fs-v1", + constraint_version="cv-v1", + llm_model="model-v1", + ) + + +def _build_source_scope() -> SourceScope: + return SourceScope( + source_id="src-1", + source_schema_hash="srchash" * 9 + "x", + source_profile_hash="profh" * 12 + "xxxx", # padded to 64 chars, the same length as the schema-hash placeholder + ) + + +def test_loaded_target_hash_feeds_run_provenance() -> None: + adapter = ManifestTargetAdapter(GOLDEN) + loaded = load_target(adapter, writer=InMemoryGraphWriter()) + run = _build_run(loaded.target_schema_snapshot_hash) + assert run.target_schema_snapshot_hash == loaded.target_schema_snapshot_hash + + +def test_pin_goes_stale_on_schema_hash_drift(tmp_path: Path) -> None: + base = load_target(ManifestTargetAdapter(GOLDEN), writer=InMemoryGraphWriter()) + raw = yaml.safe_load(GOLDEN.read_text()) + raw["entities"][0]["properties"][0]["type"] = "string" + drifted_path = tmp_path / "drifted.yaml" + drifted_path.write_text(yaml.safe_dump(raw)) + drifted = load_target( + ManifestTargetAdapter(drifted_path),
writer=InMemoryGraphWriter() + ) + pin = HumanPin( + pin_id="pin-1", + assertion_id="a-1", + pinned_at=datetime.now(tz=timezone.utc), + pinned_by="reviewer", + confirmed_under_run=_build_run(base.target_schema_snapshot_hash), + confirmed_under_source=_build_source_scope(), + ) + new_run = _build_run(drifted.target_schema_snapshot_hash) + updated = detect_pin_stale(pin, new_run, _build_source_scope()) + assert updated.pin_state is PinState.stale + + +def test_pin_goes_stale_on_card_version_drift_only(tmp_path: Path) -> None: + base = load_target(ManifestTargetAdapter(GOLDEN), writer=InMemoryGraphWriter()) + raw = yaml.safe_load(GOLDEN.read_text()) + raw["entities"][0]["context_card"]["card_version"] = "2.0.0" + raw["entities"][0]["context_card"]["description"] = "Bumped wording" + bumped_path = tmp_path / "bumped.yaml" + bumped_path.write_text(yaml.safe_dump(raw)) + bumped = load_target( + ManifestTargetAdapter(bumped_path), writer=InMemoryGraphWriter() + ) + assert base.target_schema_snapshot_hash == bumped.target_schema_snapshot_hash + assert ( + base.aggregate_context_card_version != bumped.aggregate_context_card_version + ) + pin = HumanPin( + pin_id="pin-1", + assertion_id="a-1", + pinned_at=datetime.now(tz=timezone.utc), + pinned_by="reviewer", + confirmed_under_run=_build_run( + base.target_schema_snapshot_hash, base.aggregate_context_card_version + ), + confirmed_under_source=_build_source_scope(), + ) + new_run = _build_run( + bumped.target_schema_snapshot_hash, bumped.aggregate_context_card_version + ) + updated = detect_pin_stale(pin, new_run, _build_source_scope()) + assert updated.pin_state is PinState.stale + + +def test_lazy_subset_materializes_only_selected_entities() -> None: + adapter = ManifestTargetAdapter(GOLDEN) + refs = {r.qualified_name: r for r in adapter.discover_entities()} + selected = [refs["omop.person"], refs["omop.observation"]] + writer = InMemoryGraphWriter() + loaded = load_target(adapter, writer=writer, selected_refs=selected) + 
materialized = { + r.qualified_name for r in loaded.entity_refs + } + assert materialized == {"omop.person", "omop.observation"} + + +def test_lazy_subset_hash_is_stable_across_reruns() -> None: + adapter = ManifestTargetAdapter(GOLDEN) + refs = {r.qualified_name: r for r in adapter.discover_entities()} + selected = [refs["omop.person"]] + a = load_target(adapter, writer=InMemoryGraphWriter(), selected_refs=selected) + b = load_target(adapter, writer=InMemoryGraphWriter(), selected_refs=selected) + assert a.target_schema_snapshot_hash == b.target_schema_snapshot_hash + + +def test_multi_adapter_no_collisions(tmp_path: Path) -> None: + raw = yaml.safe_load(GOLDEN.read_text()) + raw["descriptor"]["target_model_id"] = "second-target" + second_path = tmp_path / "second.yaml" + second_path.write_text(yaml.safe_dump(raw)) + a = load_target(ManifestTargetAdapter(GOLDEN), writer=InMemoryGraphWriter()) + b = load_target(ManifestTargetAdapter(second_path), writer=InMemoryGraphWriter()) + assert a.descriptor.target_model_id != b.descriptor.target_model_id + assert a.target_schema_snapshot_hash != b.target_schema_snapshot_hash + + +def test_supplied_versus_deferred_enrichment() -> None: + adapter = ManifestTargetAdapter(GOLDEN) + loaded = load_target(adapter, writer=InMemoryGraphWriter()) + by_qname = {d.entity_ref.qualified_name: d for d in loaded.enrichment_decisions} + person = by_qname["omop.person"] + measurement = by_qname["omop.measurement"] + assert ( + person.decisions[Facet.semantic_aliases].status + is EnrichmentStatus.supplied_by_adapter + ) + assert ( + measurement.decisions[Facet.semantic_aliases].status + is EnrichmentStatus.required_deferred + ) + + +def test_operator_skip_facets_overrides_other_decisions() -> None: + adapter = ManifestTargetAdapter(GOLDEN) + loaded = load_target( + adapter, + writer=InMemoryGraphWriter(), + skip_facets=["semantic_aliases"], + ) + for record in loaded.enrichment_decisions: + status = 
record.decisions[Facet.semantic_aliases].status + assert status in { + EnrichmentStatus.required_skipped, + EnrichmentStatus.supplied_by_adapter, + EnrichmentStatus.not_required, + } diff --git a/tests/unit/targets/test_golden_manifest.py b/tests/unit/targets/test_golden_manifest.py new file mode 100644 index 0000000..7353222 --- /dev/null +++ b/tests/unit/targets/test_golden_manifest.py @@ -0,0 +1,239 @@ +"""Golden compliance fixture: snapshot hashing, materialization, round-trip.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import pytest +import yaml + +from sema.models.target.enrichment import EnrichmentStatus, Facet +from sema.targets.adapters.manifest import ManifestTargetAdapter +from sema.targets.adapters.manifest_exceptions import ManifestEndpointError +from sema.targets.exceptions import DanglingRefError +from sema.targets.hashing import SnapshotHasher +from sema.targets.loader import load_target +from sema.targets.materializer import InMemoryGraphWriter +from sema.targets.materializer_ops import ( + EntityOp, + PropertyOp, + TargetObligationOp, +) +from sema.targets.normalizer import TargetModelNormalizer + +pytestmark = pytest.mark.unit + +_FIXTURES = Path(__file__).parent / "fixtures" +GOLDEN = _FIXTURES / "golden_manifest.yaml" +PINNED_HASH_FILE = _FIXTURES / "golden_manifest_hash.txt" + + +def _pinned_hash() -> str: + return PINNED_HASH_FILE.read_text().strip() + + +def _normalize(path: Path): + adapter = ManifestTargetAdapter(path) + return TargetModelNormalizer.normalize(adapter) + + +def test_pinned_digest_matches() -> None: + digest = SnapshotHasher.hash(_normalize(GOLDEN)) + assert digest == _pinned_hash() + + +def test_two_loads_produce_same_hash() -> None: + h1 = SnapshotHasher.hash(_normalize(GOLDEN)) + h2 = SnapshotHasher.hash(_normalize(GOLDEN)) + assert h1 == h2 + + +def test_permuted_entity_order_yields_same_hash(tmp_path: Path) -> None: + raw = yaml.safe_load(GOLDEN.read_text()) + 
permuted = dict(raw) + permuted["entities"] = list(reversed(raw["entities"])) + permuted_path = tmp_path / "permuted_manifest.yaml" + permuted_path.write_text(yaml.safe_dump(permuted)) + h_original = SnapshotHasher.hash(_normalize(GOLDEN)) + h_permuted = SnapshotHasher.hash(_normalize(permuted_path)) + assert h_original == h_permuted + + +def test_mutating_property_type_changes_hash(tmp_path: Path) -> None: + raw = yaml.safe_load(GOLDEN.read_text()) + raw["entities"][0]["properties"][0]["type"] = "string" + mutated_path = tmp_path / "mutated_manifest.yaml" + mutated_path.write_text(yaml.safe_dump(raw)) + h_original = SnapshotHasher.hash(_normalize(GOLDEN)) + h_mutated = SnapshotHasher.hash(_normalize(mutated_path)) + assert h_original != h_mutated + + +def test_model_role_target_on_every_materialized_node() -> None: + adapter = ManifestTargetAdapter(GOLDEN) + writer = InMemoryGraphWriter() + load_target(adapter, writer=writer) + target_model_id = adapter.parsed_manifest.descriptor.target_model_id + target_model_version = adapter.parsed_manifest.descriptor.target_model_version + for op in writer.ops: + prov = getattr(op, "target_model_id", None) + if prov is not None: + assert prov == target_model_id + assert op.target_model_version == target_model_version # type: ignore[union-attr] + + +def test_obligation_round_trip_against_writer() -> None: + adapter = ManifestTargetAdapter(GOLDEN) + writer = InMemoryGraphWriter() + load_target(adapter, writer=writer) + obligation_ops = { + op.target_entity: op + for op in writer.ops + if isinstance(op, TargetObligationOp) + } + assert "omop.person" in obligation_ops + person = obligation_ops["omop.person"] + assert "person_id" in person.payload["required_fields"] + assert "gender_concept_id" in person.payload["required_fields"] + owns = obligation_ops["acris.OWNS"] + minimum = owns.payload["minimum_viable_row"] + fields = sorted(c["field"] for c in minimum["clauses"]) + assert fields == ["object", "subject", "valid_from"] + + 
+def test_invalid_property_ref_rejected(tmp_path: Path) -> None: + raw = yaml.safe_load(GOLDEN.read_text()) + raw["entities"][0]["obligation"]["required_fields"].append("nonexistent_field") + bad_path = tmp_path / "bad_property_ref.yaml" + bad_path.write_text(yaml.safe_dump(raw)) + adapter = ManifestTargetAdapter(bad_path) + with pytest.raises(DanglingRefError, match="nonexistent_field"): + TargetModelNormalizer.normalize(adapter) + + +def test_invalid_vocabulary_ref_rejected(tmp_path: Path) -> None: + raw = yaml.safe_load(GOLDEN.read_text()) + raw["entities"][0]["properties"][1]["vocabulary_binding"] = { + "vocabulary": "GENDER_CV_TYPO" + } + bad_path = tmp_path / "bad_vocab_ref.yaml" + bad_path.write_text(yaml.safe_dump(raw)) + adapter = ManifestTargetAdapter(bad_path) + with pytest.raises(DanglingRefError, match="GENDER_CV_TYPO"): + TargetModelNormalizer.normalize(adapter) + + +def test_invalid_fk_ref_rejected(tmp_path: Path) -> None: + raw = yaml.safe_load(GOLDEN.read_text()) + raw["entities"][1]["obligation"]["foreign_keys"] = [ + { + "referenced_entity": "omop.absent", + "join_keys": [["person_id", "person_id"]], + } + ] + bad_path = tmp_path / "bad_fk_ref.yaml" + bad_path.write_text(yaml.safe_dump(raw)) + adapter = ManifestTargetAdapter(bad_path) + with pytest.raises(DanglingRefError, match="omop.absent"): + TargetModelNormalizer.normalize(adapter) + + +def test_endpoint_synthesis_for_temporal_and_non_temporal_edges() -> None: + adapter = ManifestTargetAdapter(GOLDEN) + writer = InMemoryGraphWriter() + load_target(adapter, writer=writer) + endpoint_ops_owns = [ + op + for op in writer.ops + if isinstance(op, PropertyOp) + and op.parent_entity_qualified_name == "acris.OWNS" + and op.property_kind == "ENDPOINT" + ] + endpoint_ops_same_as = [ + op + for op in writer.ops + if isinstance(op, PropertyOp) + and op.parent_entity_qualified_name == "match.SAME_AS" + and op.property_kind == "ENDPOINT" + ] + assert {p.name for p in endpoint_ops_owns} == {"subject", 
"object"} + assert {p.name for p in endpoint_ops_same_as} == {"subject", "object"} + for op in endpoint_ops_owns + endpoint_ops_same_as: + assert op.materialized_as_edge_property is False + + +def test_vocabulary_binding_round_trip() -> None: + adapter = ManifestTargetAdapter(GOLDEN) + normalized = TargetModelNormalizer.normalize(adapter) + bindings_by_prop = { + (b.entity_ref.qualified_name, b.property_name): b + for b in normalized.vocabulary_bindings + } + binding = bindings_by_prop[("omop.person", "gender_concept_id")] + assert binding.vocabulary.name == "GENDER_CV" + assert binding.domain == "Gender" + assert binding.require_standard is True + + +def test_one_decision_record_per_entity_in_loaded_target() -> None: + adapter = ManifestTargetAdapter(GOLDEN) + loaded = load_target(adapter, writer=InMemoryGraphWriter()) + qnames = {r.qualified_name for r in loaded.entity_refs} + decision_qnames = {d.entity_ref.qualified_name for d in loaded.enrichment_decisions} + assert decision_qnames == qnames + + +def test_decisions_cover_five_facets_and_compact_status_agrees() -> None: + adapter = ManifestTargetAdapter(GOLDEN) + writer = InMemoryGraphWriter() + loaded = load_target(adapter, writer=writer) + decision_by_qname = {d.entity_ref.qualified_name: d for d in loaded.enrichment_decisions} + entity_ops = [op for op in writer.ops if isinstance(op, EntityOp)] + for op in entity_ops: + record = decision_by_qname[op.qualified_name] + for facet in Facet: + assert op.enrichment_status[facet.value] == record.decisions[facet].status.value + + +def test_inline_synonyms_yield_supplied_for_semantic_aliases() -> None: + adapter = ManifestTargetAdapter(GOLDEN) + loaded = load_target(adapter, writer=InMemoryGraphWriter()) + person = next( + d for d in loaded.enrichment_decisions if d.entity_ref.qualified_name == "omop.person" + ) + assert ( + person.decisions[Facet.semantic_aliases].status + is EnrichmentStatus.supplied_by_adapter + ) + + +def 
test_card_version_aggregate_changes_when_one_card_bumps(tmp_path: Path) -> None: + raw = yaml.safe_load(GOLDEN.read_text()) + base_path = tmp_path / "base.yaml" + base_path.write_text(yaml.safe_dump(raw)) + base = load_target( + ManifestTargetAdapter(base_path), writer=InMemoryGraphWriter() + ) + bumped: dict[str, Any] = yaml.safe_load(GOLDEN.read_text()) + bumped["entities"][0]["context_card"]["card_version"] = "1.1.0" + bumped["entities"][0]["context_card"]["description"] = "Bumped" + bumped_path = tmp_path / "bumped.yaml" + bumped_path.write_text(yaml.safe_dump(bumped)) + after = load_target( + ManifestTargetAdapter(bumped_path), writer=InMemoryGraphWriter() + ) + assert base.target_schema_snapshot_hash == after.target_schema_snapshot_hash + assert base.aggregate_context_card_version != after.aggregate_context_card_version + + +def test_endpoints_block_on_table_row_rejected(tmp_path: Path) -> None: + raw = yaml.safe_load(GOLDEN.read_text()) + raw["entities"][0]["endpoints"] = { + "subject": {"target_entity": "omop.person"}, + "object": {"target_entity": "omop.person"}, + } + bad_path = tmp_path / "bad_endpoints.yaml" + bad_path.write_text(yaml.safe_dump(raw)) + with pytest.raises(ManifestEndpointError): + ManifestTargetAdapter(bad_path) diff --git a/tests/unit/targets/test_hashing.py b/tests/unit/targets/test_hashing.py new file mode 100644 index 0000000..bc9db70 --- /dev/null +++ b/tests/unit/targets/test_hashing.py @@ -0,0 +1,136 @@ +"""SnapshotHasher determinism, schema-only projection, snapshot_hash rejection.""" + +from __future__ import annotations + +import pytest + +from sema.models.target.context_card import TargetContextCard +from sema.models.target.entity import TargetEntityDecl +from sema.models.target.normalized import NormalizedTargetModel +from sema.models.target.obligation import TargetObligationDecl +from sema.models.target.properties import TargetPropertyDecl +from sema.targets.exceptions import AdapterContractError +from sema.targets.hashing 
import SnapshotHasher, compute_card_hash +from sema.targets.normalizer import TargetModelNormalizer + +from tests.unit.targets.conftest import ( + ScriptedAdapter, + make_descriptor, + make_entity_ref, + make_obligation, + make_table_row_entity, +) + +pytestmark = pytest.mark.unit + + +def _build_simple_normalized() -> NormalizedTargetModel: + entity = make_table_row_entity() + obligation = make_obligation() + adapter = ScriptedAdapter(make_descriptor(), [entity], [obligation]) + return TargetModelNormalizer.normalize(adapter) + + +def test_hash_is_64_char_hex() -> None: + digest = SnapshotHasher.hash(_build_simple_normalized()) + assert len(digest) == 64 + int(digest, 16) + + +def test_hasher_determinism_same_input_same_digest() -> None: + digest1 = SnapshotHasher.hash(_build_simple_normalized()) + digest2 = SnapshotHasher.hash(_build_simple_normalized()) + assert digest1 == digest2 + + +def test_hasher_mutated_property_type_changes_digest() -> None: + base = _build_simple_normalized() + base_digest = SnapshotHasher.hash(base) + mutated_entity = base.entities[0].model_copy( + update={ + "properties": [ + TargetPropertyDecl(name="person_id", type="integer", nullable=False) + ] + } + ) + mutated = base.model_copy(update={"entities": [mutated_entity]}) + assert SnapshotHasher.hash(mutated) != base_digest + + +def test_hasher_descriptor_display_name_and_owner_excluded() -> None: + base = _build_simple_normalized() + mutated_descriptor = base.descriptor.model_copy( + update={"display_name": "Different", "owner": "someone-else"} + ) + mutated = base.model_copy(update={"descriptor": mutated_descriptor}) + assert SnapshotHasher.hash(mutated) == SnapshotHasher.hash(base) + + +def test_hasher_context_card_changes_excluded_from_schema_hash() -> None: + base = _build_simple_normalized() + card = TargetContextCard( + entity_ref=base.entities[0].ref, + card_version="1.0.0", + description="Original", + ) + with_card = base.model_copy(update={"context_cards": [card]}) + 
different_card = card.model_copy(update={"description": "Different"}) + with_diff_card = base.model_copy(update={"context_cards": [different_card]}) + assert SnapshotHasher.hash(with_card) == SnapshotHasher.hash(with_diff_card) + + +def test_hasher_rejects_snapshot_hash_field_anywhere() -> None: + base = _build_simple_normalized() + descriptor_dict = base.descriptor.model_dump(mode="json") + descriptor_dict["snapshot_hash"] = "fake" + + class _ToyModel: + descriptor = base.descriptor + entities: list[TargetEntityDecl] = base.entities + obligations: list[TargetObligationDecl] = base.obligations + vocabularies = base.vocabularies + vocabulary_bindings = base.vocabulary_bindings + terms = base.terms + context_cards = base.context_cards + + def model_dump(self, mode: str = "json") -> dict[str, object]: # noqa: ARG002 + return descriptor_dict + + from sema.targets.hashing import _scan_for_snapshot_hash_field + + with pytest.raises(AdapterContractError, match="snapshot_hash"): + _scan_for_snapshot_hash_field({"some_dto": {"snapshot_hash": "x"}}) + + +def test_card_hash_determinism() -> None: + card = TargetContextCard( + entity_ref=make_entity_ref(), + card_version="1.0.0", + description="Hello", + examples=["a", "b"], + curated_synonyms=["alias"], + ) + h1 = compute_card_hash(card) + h2 = compute_card_hash(card) + assert h1 == h2 + assert len(h1) == 64 + int(h1, 16) + + +def test_card_hash_changes_on_content_mutation() -> None: + card_a = TargetContextCard( + entity_ref=make_entity_ref(), card_version="1.0.0", description="A" + ) + card_b = card_a.model_copy(update={"description": "B"}) + assert compute_card_hash(card_a) != compute_card_hash(card_b) + + +def test_card_hash_changes_when_synonyms_mutate() -> None: + base = TargetContextCard( + entity_ref=make_entity_ref(), + card_version="1.0.0", + description="Hi", + curated_synonyms=["x"], + ) + mutated = base.model_copy(update={"curated_synonyms": ["x", "y"]}) + assert compute_card_hash(base) != 
compute_card_hash(mutated) diff --git a/tests/unit/targets/test_import_boundaries.py b/tests/unit/targets/test_import_boundaries.py new file mode 100644 index 0000000..f36cf12 --- /dev/null +++ b/tests/unit/targets/test_import_boundaries.py @@ -0,0 +1,166 @@ +"""Module import boundary enforcement for `sema.targets`. + +Adapters, normalizer, hashing, and registry MUST NOT import from +`sema.engine`, `sema.pipeline`, or `sema.graph`. The materializer MAY +import from `sema.graph` but MUST NOT import from `sema.engine` or +`sema.pipeline`. +""" + +from __future__ import annotations + +import ast +import importlib +import importlib.util +import sys +from pathlib import Path + +import pytest + +pytestmark = pytest.mark.unit + +_FORBIDDEN_FOR_NON_MATERIALIZER = ("sema.engine", "sema.pipeline", "sema.graph") +_FORBIDDEN_FOR_MATERIALIZER = ("sema.engine", "sema.pipeline") +_REPO_ROOT = Path(__file__).resolve().parents[3] +_TARGETS_ROOT = _REPO_ROOT / "src" / "sema" / "targets" +_MATERIALIZER_SUFFIX = "materializer.py" + + +def _iter_target_modules() -> list[Path]: + return sorted(p for p in _TARGETS_ROOT.rglob("*.py") if p.name != "__init__.py") + + +def _module_imports(path: Path) -> set[str]: + tree = ast.parse(path.read_text()) + out: set[str] = set() + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + out.add(alias.name) + elif isinstance(node, ast.ImportFrom) and node.module is not None: + out.add(node.module) + return out + + +def _is_materializer(path: Path) -> bool: + return path.name == _MATERIALIZER_SUFFIX + + +def _forbidden_for(path: Path) -> tuple[str, ...]: + return _FORBIDDEN_FOR_MATERIALIZER if _is_materializer(path) else _FORBIDDEN_FOR_NON_MATERIALIZER + + +def _violation(import_name: str, forbidden: tuple[str, ...]) -> str | None: + for prefix in forbidden: + if import_name == prefix or import_name.startswith(prefix + "."): + return prefix + return None + + +def test_direct_imports_respect_boundaries() -> None: + 
violations: list[str] = [] + for path in _iter_target_modules(): + forbidden = _forbidden_for(path) + for imp in _module_imports(path): + offender = _violation(imp, forbidden) + if offender is not None: + violations.append(f"{path.relative_to(_REPO_ROOT)} imports {imp} (forbidden prefix: {offender})") + assert not violations, "import-boundary violations:\n" + "\n".join(violations) + + +def _resolve_module_path(module_name: str) -> Path | None: + spec = importlib.util.find_spec(module_name) + if spec is None or spec.origin is None or spec.origin == "built-in": + return None + return Path(spec.origin) + + +def _is_inside_repo(path: Path) -> bool: + try: + path.relative_to(_REPO_ROOT) + except ValueError: + return False + return True + + +def _walk_transitive(start: Path, max_depth: int = 5) -> set[str]: + seen_paths: set[Path] = set() + seen_modules: set[str] = set() + + def _walk(path: Path, depth: int) -> None: + if depth > max_depth or path in seen_paths or not path.exists(): + return + seen_paths.add(path) + try: + imports = _module_imports(path) + except SyntaxError: + return + for imp in imports: + seen_modules.add(imp) + if not imp.startswith("sema."): + continue + resolved = _resolve_module_path(imp) + if resolved is None or not _is_inside_repo(resolved): + continue + _walk(resolved, depth + 1) + + _walk(start, 0) + return seen_modules + + +def test_transitive_imports_respect_boundaries() -> None: + violations: list[str] = [] + for path in _iter_target_modules(): + forbidden = _forbidden_for(path) + for imp in _walk_transitive(path): + offender = _violation(imp, forbidden) + if offender is not None: + violations.append( + f"{path.relative_to(_REPO_ROOT)} transitively reaches {imp} " + f"(forbidden prefix: {offender})" + ) + assert not violations, "transitive import-boundary violations:\n" + "\n".join(violations) + + +def test_fixture_adapter_importing_engine_is_detected(tmp_path: Path) -> None: + bad = tmp_path / "fake_adapter.py" + bad.write_text("from 
sema.engine.semantic import infer_type\n") + imports = _module_imports(bad) + assert any(imp.startswith("sema.engine") for imp in imports) + assert _violation("sema.engine.semantic", _FORBIDDEN_FOR_NON_MATERIALIZER) == "sema.engine" + + +def test_fixture_normalizer_importing_graph_is_detected(tmp_path: Path) -> None: + bad = tmp_path / "normalizer.py" + bad.write_text("from sema.graph.loader import GraphLoader\n") # parsed below, so the written file rather than a literal is judged + assert [_violation(imp, _forbidden_for(bad)) for imp in _module_imports(bad)] == ["sema.graph"] + + +def test_fixture_materializer_importing_graph_is_allowed(tmp_path: Path) -> None: + materializer = tmp_path / "materializer.py" + materializer.write_text("from sema.graph.loader import GraphLoader\n") + assert _is_materializer(materializer) + assert [_violation(imp, _forbidden_for(materializer)) for imp in _module_imports(materializer)] == [None] # policy is selected from the file name + + +def test_fixture_materializer_importing_engine_is_rejected(tmp_path: Path) -> None: + materializer = tmp_path / "materializer.py" + materializer.write_text("from sema.engine.semantic import infer_type\n") + assert _is_materializer(materializer) + assert [_violation(imp, _forbidden_for(materializer)) for imp in _module_imports(materializer)] == ["sema.engine"] + + +def test_modules_inspected_so_no_silent_skip() -> None: + modules = _iter_target_modules() + paths = [p.name for p in modules] + expected = {"base.py", "registry.py", "registry_utils.py", "exceptions.py"} + missing = expected - set(paths) + assert not missing, f"target package missing expected modules: {missing}" + + +def test_walking_an_unknown_module_is_safe(tmp_path: Path) -> None: + f = tmp_path / "nonexistent.py" + f.write_text("import does_not_exist\n") + seen = _walk_transitive(f) + assert "does_not_exist" in seen + assert importlib.util.find_spec("does_not_exist") is None + assert "does_not_exist" not in sys.modules # scanning must not have imported the unknown module diff --git a/tests/unit/targets/test_loader.py b/tests/unit/targets/test_loader.py new file mode 100644 index 0000000..209d428 --- /dev/null +++ b/tests/unit/targets/test_loader.py @@ -0,0 +1,327 @@
+"""End-to-end loader pipeline tests: load_target orchestration + decisions + cards.""" + +from __future__ import annotations + +import pytest + +from sema.models.target.completeness import ( + SemanticCompleteness, + SemanticCompletenessAnnotations, +) +from sema.models.target.context_card import TargetContextCard +from sema.models.target.enrichment import EnrichmentStatus, Facet +from sema.models.target.properties import TargetPropertyDecl +from sema.models.target.refs import VocabularyRef, VocabularySource +from sema.models.target.term import TargetTermDecl +from sema.models.target.vocab_binding import VocabularyBindingDecl +from sema.targets.exceptions import CardContentDriftError, LoaderStageOrderError +from sema.targets.hashing import compute_card_hash +from sema.targets.loader import load_target +from sema.targets.materializer import ( + InMemoryGraphWriter, + StageGuard, + TargetModelMaterializer, +) + +from tests.unit.targets.conftest import ( + ScriptedAdapter, + default_completeness, + make_descriptor, + make_obligation, + make_table_row_entity, +) + +pytestmark = pytest.mark.unit + + +def _basic_adapter(target_model_id: str = "fake-target") -> ScriptedAdapter: + entity = make_table_row_entity(target_model_id=target_model_id) + obligation = make_obligation() + return ScriptedAdapter(make_descriptor(target_model_id), [entity], [obligation]) + + +def test_load_target_emits_loaded_target_with_pinned_hash() -> None: + writer = InMemoryGraphWriter() + loaded = load_target(_basic_adapter(), writer=writer) + assert len(loaded.target_schema_snapshot_hash) == 64 + assert loaded.entity_refs[0].qualified_name == "fake.person" + assert len(loaded.enrichment_decisions) == 1 + + +def test_one_decision_record_per_entity() -> None: + a = make_table_row_entity(qualified_name="fake.alpha") + b = make_table_row_entity(qualified_name="fake.beta") + c = make_table_row_entity(qualified_name="fake.gamma") + obligations = [ + make_obligation("fake.alpha"), + 
make_obligation("fake.beta"), + make_obligation("fake.gamma"), + ] + adapter = ScriptedAdapter(make_descriptor(), [a, b, c], obligations) + loaded = load_target(adapter, writer=InMemoryGraphWriter()) + assert len(loaded.enrichment_decisions) == 3 + + +def test_decision_covers_five_facets_exactly() -> None: + loaded = load_target(_basic_adapter(), writer=InMemoryGraphWriter()) + record = loaded.enrichment_decisions[0] + assert set(record.decisions.keys()) == set(Facet) + + +def test_complete_facet_yields_not_required() -> None: + completeness = SemanticCompletenessAnnotations( + structure=SemanticCompleteness.COMPLETE, + obligations=SemanticCompleteness.COMPLETE, + vocabulary_bindings=SemanticCompleteness.COMPLETE, + semantic_aliases=SemanticCompleteness.COMPLETE, + terms=SemanticCompleteness.COMPLETE, + ) + descriptor = make_descriptor(completeness=completeness) + entity = make_table_row_entity() + adapter = ScriptedAdapter(descriptor, [entity], [make_obligation()]) + loaded = load_target(adapter, writer=InMemoryGraphWriter()) + statuses = {f: d.status for f, d in loaded.enrichment_decisions[0].decisions.items()} + assert all(s is EnrichmentStatus.not_required for s in statuses.values()) + + +def test_partial_facet_yields_required_deferred() -> None: + loaded = load_target(_basic_adapter(), writer=InMemoryGraphWriter()) + record = loaded.enrichment_decisions[0] + assert ( + record.decisions[Facet.semantic_aliases].status is EnrichmentStatus.required_deferred + ) + + +def test_external_facet_yields_not_required() -> None: + loaded = load_target(_basic_adapter(), writer=InMemoryGraphWriter()) + record = loaded.enrichment_decisions[0] + assert record.decisions[Facet.terms].status is EnrichmentStatus.not_required + + +def test_skip_facets_opt_out_yields_required_skipped() -> None: + loaded = load_target( + _basic_adapter(), + writer=InMemoryGraphWriter(), + skip_facets=["semantic_aliases"], + ) + record = loaded.enrichment_decisions[0] + assert ( + 
record.decisions[Facet.semantic_aliases].status is EnrichmentStatus.required_skipped + ) + assert record.decisions[Facet.structure].status is EnrichmentStatus.not_required + + +def test_inline_synonyms_yield_supplied_by_adapter() -> None: + entity = make_table_row_entity( + properties=[ + TargetPropertyDecl( + name="person_id", + type="string", + nullable=False, + synonyms=["alias1", "alias2"], + ) + ] + ) + adapter = ScriptedAdapter(make_descriptor(), [entity], [make_obligation()]) + loaded = load_target(adapter, writer=InMemoryGraphWriter()) + record = loaded.enrichment_decisions[0] + assert ( + record.decisions[Facet.semantic_aliases].status + is EnrichmentStatus.supplied_by_adapter + ) + + +def test_inline_terms_with_binding_yield_supplied_by_adapter_for_terms() -> None: + entity = make_table_row_entity() + binding = VocabularyBindingDecl( + entity_ref=entity.ref, + property_name="person_id", + vocabulary=VocabularyRef(name="GENDER_CV", source=VocabularySource.INLINE), + ) + term = TargetTermDecl( + vocabulary=VocabularyRef(name="GENDER_CV", source=VocabularySource.INLINE), + code="M", + display="Male", + ) + adapter = ScriptedAdapter( + make_descriptor(), + [entity], + [make_obligation()], + bindings=[binding], + terms=[term], + ) + loaded = load_target(adapter, writer=InMemoryGraphWriter()) + record = loaded.enrichment_decisions[0] + assert record.decisions[Facet.terms].status is EnrichmentStatus.supplied_by_adapter + + +def test_aggregate_card_version_single_entity_returns_literal() -> None: + loaded = load_target(_basic_adapter(), writer=InMemoryGraphWriter()) + assert loaded.aggregate_context_card_version == "0.0.0+synthesized" + + +def test_aggregate_card_version_multi_entity_is_deterministic_hash() -> None: + a = make_table_row_entity(qualified_name="fake.alpha") + b = make_table_row_entity(qualified_name="fake.beta") + cards = [ + TargetContextCard(entity_ref=a.ref, card_version="1.0.0", description="A"), + TargetContextCard(entity_ref=b.ref, 
card_version="2.1.3", description="B"), + ] + adapter = ScriptedAdapter( + make_descriptor(), + [a, b], + [make_obligation("fake.alpha"), make_obligation("fake.beta")], + cards=cards, + ) + loaded = load_target(adapter, writer=InMemoryGraphWriter()) + aggregate = loaded.aggregate_context_card_version + assert len(aggregate) == 64 + int(aggregate, 16) + loaded2 = load_target(adapter, writer=InMemoryGraphWriter()) + assert loaded2.aggregate_context_card_version == aggregate + + +def test_card_content_drift_detected() -> None: + entity = make_table_row_entity() + card = TargetContextCard( + entity_ref=entity.ref, card_version="1.0.0", description="Original" + ) + adapter = ScriptedAdapter( + make_descriptor(), [entity], [make_obligation()], cards=[card] + ) + persisted = { + (entity.ref.target_model_id, entity.ref.qualified_name, "1.0.0"): "deadbeef" * 8, + } + with pytest.raises(CardContentDriftError): + load_target( + adapter, writer=InMemoryGraphWriter(), persisted_card_hashes=persisted + ) + + +def test_persisted_card_hashes_keyed_by_target_model_id() -> None: + """Two distinct target_model_ids that happen to declare the same + qualified_name + card_version MUST not cross-poison each other's + drift checks.""" + entity = make_table_row_entity(target_model_id="omop") + other_entity = make_table_row_entity(target_model_id="acris") + card = TargetContextCard( + entity_ref=entity.ref, card_version="1.0.0", description="omop card" + ) + other_card = TargetContextCard( + entity_ref=other_entity.ref, card_version="1.0.0", description="acris card" + ) + adapter = ScriptedAdapter( + make_descriptor(target_model_id="omop"), + [entity], + [make_obligation()], + cards=[card], + ) + other_adapter = ScriptedAdapter( + make_descriptor(target_model_id="acris"), + [other_entity], + [make_obligation()], + cards=[other_card], + ) + persisted = { + ("omop", entity.ref.qualified_name, "1.0.0"): compute_card_hash(card), + } + load_target( + adapter, writer=InMemoryGraphWriter(), 
persisted_card_hashes=persisted + ) + load_target( + other_adapter, + writer=InMemoryGraphWriter(), + persisted_card_hashes=persisted, + ) + + +def test_card_drift_under_same_target_model_id_still_raises() -> None: + entity = make_table_row_entity(target_model_id="omop") + card = TargetContextCard( + entity_ref=entity.ref, card_version="1.0.0", description="X" + ) + adapter = ScriptedAdapter( + make_descriptor(target_model_id="omop"), + [entity], + [make_obligation()], + cards=[card], + ) + persisted = { + ("omop", entity.ref.qualified_name, "1.0.0"): "deadbeef" * 8, + } + with pytest.raises(CardContentDriftError): + load_target( + adapter, writer=InMemoryGraphWriter(), persisted_card_hashes=persisted + ) + + +def test_card_version_bump_permits_content_change() -> None: + entity = make_table_row_entity() + card_v1 = TargetContextCard( + entity_ref=entity.ref, card_version="1.0.0", description="Old" + ) + adapter_v1 = ScriptedAdapter( + make_descriptor(), [entity], [make_obligation()], cards=[card_v1] + ) + loaded_v1 = load_target(adapter_v1, writer=InMemoryGraphWriter()) + persisted = { + (entity.ref.target_model_id, entity.ref.qualified_name, "1.0.0"): compute_card_hash(card_v1), + } + card_v2 = TargetContextCard( + entity_ref=entity.ref, card_version="1.1.0", description="New" + ) + adapter_v2 = ScriptedAdapter( + make_descriptor(), [entity], [make_obligation()], cards=[card_v2] + ) + loaded_v2 = load_target( + adapter_v2, + writer=InMemoryGraphWriter(), + persisted_card_hashes=persisted, + ) + assert loaded_v1.aggregate_context_card_version == "1.0.0" + assert loaded_v2.aggregate_context_card_version == "1.1.0" + + +def test_lazy_subset_hash_differs_from_full() -> None: + a = make_table_row_entity(qualified_name="fake.alpha") + b = make_table_row_entity(qualified_name="fake.beta") + obligations = [make_obligation("fake.alpha"), make_obligation("fake.beta")] + adapter = ScriptedAdapter(make_descriptor(), [a, b], obligations) + full = load_target(adapter, 
writer=InMemoryGraphWriter()) + subset = load_target( + adapter, writer=InMemoryGraphWriter(), selected_refs=[a.ref] + ) + assert full.target_schema_snapshot_hash != subset.target_schema_snapshot_hash + + +def test_stage_spy_observes_normalize_hash_materialize_order() -> None: + events: list[str] = [] + load_target( + _basic_adapter(), writer=InMemoryGraphWriter(), stage_spy=events.append + ) + assert events == [ + "normalize_started", + "normalize_completed", + "hash_started", + "hash_completed", + "materialize_started", + "materialize_completed", + ] + + +def test_calling_materializer_before_hash_raises() -> None: + writer = InMemoryGraphWriter() + guard = StageGuard() + adapter = _basic_adapter() + from sema.targets.normalizer import TargetModelNormalizer + + normalized = TargetModelNormalizer.normalize(adapter) + guard.transition_to(StageGuard.NORMALIZED) + with pytest.raises(LoaderStageOrderError): + TargetModelMaterializer.write( + normalized, + "0" * 64, + writer, + enrichment_decisions=[], + stage_guard=guard, + ) diff --git a/tests/unit/targets/test_manifest_adapter.py b/tests/unit/targets/test_manifest_adapter.py new file mode 100644 index 0000000..993a11e --- /dev/null +++ b/tests/unit/targets/test_manifest_adapter.py @@ -0,0 +1,498 @@ +"""ManifestTargetAdapter parser + adapter behavior.""" + +from __future__ import annotations + +import json +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +import yaml + +from sema.models.planner._enums import PrimaryKeyStrategy, TargetArtifactKind +from sema.models.target.completeness import SemanticCompleteness +from sema.models.target.refs import VocabularyRef, VocabularySource +from sema.targets import TargetOntologyAdapter +from sema.targets.adapters.manifest import ManifestTargetAdapter +from sema.targets.adapters.manifest_exceptions import ( + ManifestContextCardError, + ManifestEndpointError, + ManifestReservedNameError, + ManifestSchemaError, + UnsupportedManifestExtensionError, + 
UnsupportedManifestVersionError, +) +from sema.targets.loader import load_target +from sema.targets.materializer import InMemoryGraphWriter + +pytestmark = pytest.mark.unit + + +def _write_yaml(tmp_path: Path, content: dict, name: str = "manifest.yaml") -> Path: + path = tmp_path / name + path.write_text(yaml.safe_dump(content)) + return path + + +def _write_json(tmp_path: Path, content: dict, name: str = "manifest.json") -> Path: + path = tmp_path / name + path.write_text(json.dumps(content)) + return path + + +def _minimal_manifest() -> dict: + return { + "manifest_version": 1, + "descriptor": { + "target_model_id": "fake-target", + "target_model_version": "1.0.0", + "display_name": "Fake", + }, + "entities": [ + { + "qualified_name": "fake.person", + "kind": "TABLE_ROW", + "properties": [ + {"name": "person_id", "type": "integer", "nullable": False} + ], + "obligation": { + "required_fields": ["person_id"], + "primary_key": "NATURAL_KEY", + }, + } + ], + } + + +def test_minimal_yaml_manifest_parses(tmp_path: Path) -> None: + path = _write_yaml(tmp_path, _minimal_manifest()) + adapter = ManifestTargetAdapter(path) + refs = list(adapter.discover_entities()) + assert len(refs) == 1 + assert refs[0].qualified_name == "fake.person" + + +def test_yaml_and_json_manifests_produce_equal_parsed_manifest(tmp_path: Path) -> None: + payload = _minimal_manifest() + yaml_path = _write_yaml(tmp_path, payload) + json_path = _write_json(tmp_path, payload) + yaml_adapter = ManifestTargetAdapter(yaml_path) + json_adapter = ManifestTargetAdapter(json_path) + assert yaml_adapter.parsed_manifest == json_adapter.parsed_manifest + + +def test_unknown_manifest_version_rejected(tmp_path: Path) -> None: + payload = _minimal_manifest() + payload["manifest_version"] = 99 + path = _write_yaml(tmp_path, payload) + with pytest.raises(UnsupportedManifestVersionError, match="99"): + ManifestTargetAdapter(path) + + +def test_missing_descriptor_rejected(tmp_path: Path) -> None: + payload = 
_minimal_manifest() + del payload["descriptor"] + path = _write_yaml(tmp_path, payload) + with pytest.raises(ManifestSchemaError, match="descriptor"): + ManifestTargetAdapter(path) + + +def test_unknown_extension_rejected(tmp_path: Path) -> None: + path = tmp_path / "manifest.txt" + path.write_text("not a manifest") + with pytest.raises(UnsupportedManifestExtensionError): + ManifestTargetAdapter(path) + + +def test_full_dto_coverage_from_rich_manifest(tmp_path: Path) -> None: + payload = _rich_manifest() + path = _write_yaml(tmp_path, payload) + adapter = ManifestTargetAdapter(path) + writer = InMemoryGraphWriter() + loaded = load_target(adapter, writer=writer) + assert len(loaded.entity_refs) >= 2 + + +def test_llm_client_invoke_never_called_during_parsing(tmp_path: Path) -> None: + spy = MagicMock() + path = _write_yaml(tmp_path, _minimal_manifest()) + ManifestTargetAdapter(path) + assert spy.invoke.call_count == 0 + + +def test_external_vocabulary_binding_yields_no_terms(tmp_path: Path) -> None: + payload = _minimal_manifest() + payload["vocabularies"] = [{"name": "SNOMED", "source": "EXTERNAL"}] + payload["entities"][0]["properties"][0]["vocabulary_binding"] = { + "vocabulary": "SNOMED", + "domain": "Identifier", + } + path = _write_yaml(tmp_path, payload) + adapter = ManifestTargetAdapter(path) + vocab = VocabularyRef(name="SNOMED", source=VocabularySource.EXTERNAL) + with pytest.raises(NotImplementedError): + next(adapter.iter_terms(vocab)) + + +def test_inline_terms_produce_term_decls(tmp_path: Path) -> None: + payload = _minimal_manifest() + payload["vocabularies"] = [{"name": "GENDER_CV", "source": "INLINE"}] + payload["terms"] = [ + {"vocabulary": "GENDER_CV", "code": "M", "display": "Male"}, + {"vocabulary": "GENDER_CV", "code": "F", "display": "Female"}, + ] + payload["entities"][0]["properties"][0]["vocabulary_binding"] = { + "vocabulary": "GENDER_CV", + "domain": "Gender", + } + path = _write_yaml(tmp_path, payload) + adapter = 
ManifestTargetAdapter(path) + vocab = VocabularyRef(name="GENDER_CV", source=VocabularySource.INLINE) + inline_terms = list(adapter.iter_terms(vocab)) + assert {t.code for t in inline_terms} == {"M", "F"} + + +def test_descriptor_default_completeness_when_omitted(tmp_path: Path) -> None: + path = _write_yaml(tmp_path, _minimal_manifest()) + adapter = ManifestTargetAdapter(path) + descriptor = adapter.describe() + assert descriptor.completeness.structure is SemanticCompleteness.COMPLETE + assert descriptor.completeness.obligations is SemanticCompleteness.COMPLETE + assert descriptor.completeness.vocabulary_bindings is SemanticCompleteness.PARTIAL + assert descriptor.completeness.semantic_aliases is SemanticCompleteness.PARTIAL + assert descriptor.completeness.terms is SemanticCompleteness.EXTERNAL + + +def test_explicit_descriptor_completeness_overrides_default(tmp_path: Path) -> None: + payload = _minimal_manifest() + payload["descriptor"]["completeness"] = { + "structure": "COMPLETE", + "obligations": "COMPLETE", + "vocabulary_bindings": "COMPLETE", + "semantic_aliases": "COMPLETE", + "terms": "COMPLETE", + } + path = _write_yaml(tmp_path, payload) + adapter = ManifestTargetAdapter(path) + descriptor = adapter.describe() + assert all( + getattr(descriptor.completeness, f) is SemanticCompleteness.COMPLETE + for f in ( + "structure", + "obligations", + "vocabulary_bindings", + "semantic_aliases", + "terms", + ) + ) + + +def test_synonyms_upgrade_entity_semantic_aliases_to_complete(tmp_path: Path) -> None: + payload = _minimal_manifest() + payload["entities"][0]["properties"][0]["synonyms"] = ["alias"] + path = _write_yaml(tmp_path, payload) + adapter = ManifestTargetAdapter(path) + entity = adapter.load_entity(next(iter(adapter.discover_entities()))) + assert entity.completeness is not None + assert entity.completeness.semantic_aliases is SemanticCompleteness.COMPLETE + + +def test_inline_terms_upgrade_entity_terms_to_complete(tmp_path: Path) -> None: + payload = 
_minimal_manifest() + payload["vocabularies"] = [{"name": "GENDER_CV", "source": "INLINE"}] + payload["terms"] = [{"vocabulary": "GENDER_CV", "code": "M", "display": "Male"}] + payload["entities"][0]["properties"][0]["vocabulary_binding"] = { + "vocabulary": "GENDER_CV" + } + path = _write_yaml(tmp_path, payload) + adapter = ManifestTargetAdapter(path) + entity = adapter.load_entity(next(iter(adapter.discover_entities()))) + assert entity.completeness is not None + assert entity.completeness.terms is SemanticCompleteness.COMPLETE + + +def test_table_row_with_endpoints_rejected(tmp_path: Path) -> None: + payload = _minimal_manifest() + payload["entities"][0]["endpoints"] = { + "subject": {"target_entity": "x"}, + "object": {"target_entity": "y"}, + } + path = _write_yaml(tmp_path, payload) + with pytest.raises(ManifestEndpointError, match="forbids endpoints"): + ManifestTargetAdapter(path) + + +def test_graph_edge_missing_endpoints_rejected(tmp_path: Path) -> None: + payload = _minimal_manifest() + payload["entities"] = [ + { + "qualified_name": "fake.OWNS", + "kind": "GRAPH_EDGE", + "properties": [], + "obligation": { + "required_fields": ["subject"], + "primary_key": "NATURAL_KEY", + }, + } + ] + path = _write_yaml(tmp_path, payload) + with pytest.raises(ManifestEndpointError, match="requires endpoints"): + ManifestTargetAdapter(path) + + +def test_graph_edge_endpoint_targets_table_row_rejected(tmp_path: Path) -> None: + payload = _minimal_manifest() + payload["entities"].append( + { + "qualified_name": "fake.OWNS", + "kind": "GRAPH_EDGE", + "endpoints": { + "subject": {"target_entity": "fake.person"}, + "object": {"target_entity": "fake.person"}, + }, + "properties": [], + "obligation": { + "required_fields": ["subject"], + "primary_key": "NATURAL_KEY", + }, + } + ) + path = _write_yaml(tmp_path, payload) + with pytest.raises(ManifestEndpointError, match="TABLE_ROW"): + ManifestTargetAdapter(path) + + +def test_reserved_property_name_rejected(tmp_path: Path) -> 
None: + payload = _minimal_manifest() + payload["entities"][0]["properties"].append( + {"name": "subject", "type": "string", "nullable": False} + ) + path = _write_yaml(tmp_path, payload) + with pytest.raises(ManifestReservedNameError, match="reserved"): + ManifestTargetAdapter(path) + + +def test_missing_card_version_rejected(tmp_path: Path) -> None: + payload = _minimal_manifest() + payload["entities"][0]["context_card"] = {"description": "Hello"} + path = _write_yaml(tmp_path, payload) + with pytest.raises(ManifestSchemaError, match="card_version"): + ManifestTargetAdapter(path) + + +def test_card_hash_in_manifest_rejected(tmp_path: Path) -> None: + payload = _minimal_manifest() + payload["entities"][0]["context_card"] = { + "card_version": "1.0.0", + "description": "Hello", + "card_hash": "deadbeef", + } + path = _write_yaml(tmp_path, payload) + with pytest.raises(ManifestContextCardError, match="card_hash"): + ManifestTargetAdapter(path) + + +def test_empty_card_description_rejected(tmp_path: Path) -> None: + payload = _minimal_manifest() + payload["entities"][0]["context_card"] = { + "card_version": "1.0.0", + "description": "", + } + path = _write_yaml(tmp_path, payload) + with pytest.raises(ManifestSchemaError, match="description"): + ManifestTargetAdapter(path) + + +def test_synthesized_card_when_block_omitted(tmp_path: Path) -> None: + path = _write_yaml(tmp_path, _minimal_manifest()) + adapter = ManifestTargetAdapter(path) + ref = next(iter(adapter.discover_entities())) + card = adapter.load_context_card(ref) + assert card.card_version == "0.0.0+synthesized" + assert card.description.startswith("Auto-generated card for fake.person") + + +def test_graph_edge_endpoints_round_trip(tmp_path: Path) -> None: + payload = _acris_manifest() + path = _write_yaml(tmp_path, payload) + adapter = ManifestTargetAdapter(path) + refs = {r.qualified_name: r for r in adapter.discover_entities()} + edge_entity = adapter.load_entity(refs["acris.OWNS"]) + assert 
edge_entity.endpoints is not None + assert edge_entity.endpoints.subject.target_entity.qualified_name == "acris.LLC" + assert edge_entity.endpoints.subject.cardinality == "one" + assert edge_entity.endpoints.subject.nullable is False + + +def test_graph_edge_minimum_viable_row_default_is_subject_and_object( + tmp_path: Path, +) -> None: + payload = _acris_manifest() + payload["entities"][2]["obligation"] = { + "required_fields": ["subject", "object"], + "primary_key": "NATURAL_KEY", + } + path = _write_yaml(tmp_path, payload) + adapter = ManifestTargetAdapter(path) + obligation = adapter.load_obligation( + next(r for r in adapter.discover_entities() if r.qualified_name == "acris.OWNS") + ) + minimum = obligation.minimum_viable_row + assert minimum is not None + fields = sorted(c.field for c in minimum.clauses) + assert fields == ["object", "subject"] + + +def test_graph_edge_explicit_temporal_minimum_honored(tmp_path: Path) -> None: + payload = _acris_manifest() + payload["entities"][2]["obligation"] = { + "required_fields": ["subject", "object", "valid_from"], + "primary_key": "NATURAL_KEY", + "minimum_viable_row": { + "op": "AND", + "clauses": [ + {"kind": "presence", "field": "subject"}, + {"kind": "presence", "field": "object"}, + {"kind": "presence", "field": "valid_from"}, + ], + }, + } + path = _write_yaml(tmp_path, payload) + adapter = ManifestTargetAdapter(path) + obligation = adapter.load_obligation( + next(r for r in adapter.discover_entities() if r.qualified_name == "acris.OWNS") + ) + minimum = obligation.minimum_viable_row + assert minimum is not None + fields = sorted(c.field for c in minimum.clauses) + assert fields == ["object", "subject", "valid_from"] + + +def test_vocabulary_binding_round_trip(tmp_path: Path) -> None: + payload = _minimal_manifest() + payload["vocabularies"] = [{"name": "SNOMED", "source": "EXTERNAL"}] + payload["entities"][0]["properties"][0]["vocabulary_binding"] = { + "vocabulary": "SNOMED", + "domain": "Gender", + 
"require_standard": True, + "allow_zero_default": False, + "resolver_policy_ref": "policy-A", + } + path = _write_yaml(tmp_path, payload) + adapter = ManifestTargetAdapter(path) + ref = next(iter(adapter.discover_entities())) + from sema.models.target.refs import TargetPropertyRef + + bindings = list( + adapter.load_vocabulary_bindings( + TargetPropertyRef(entity_ref=ref, property_name="person_id") + ) + ) + assert len(bindings) == 1 + binding = bindings[0] + assert binding.vocabulary.name == "SNOMED" + assert binding.vocabulary.source is VocabularySource.EXTERNAL + assert binding.domain == "Gender" + assert binding.require_standard is True + assert binding.resolver_policy_ref == "policy-A" + + +def test_adapter_satisfies_runtime_protocol(tmp_path: Path) -> None: + path = _write_yaml(tmp_path, _minimal_manifest()) + adapter = ManifestTargetAdapter(path) + assert isinstance(adapter, TargetOntologyAdapter) + + +def test_load_unknown_entity_raises_keyerror(tmp_path: Path) -> None: + path = _write_yaml(tmp_path, _minimal_manifest()) + adapter = ManifestTargetAdapter(path) + bogus = next(iter(adapter.discover_entities())).model_copy( + update={"qualified_name": "fake.unknown"} + ) + with pytest.raises(KeyError, match="fake.unknown"): + adapter.load_entity(bogus) + + +def _acris_manifest() -> dict: + return { + "manifest_version": 1, + "descriptor": { + "target_model_id": "acris-nyc", + "target_model_version": "1.0.0", + "display_name": "ACRIS NYC", + }, + "entities": [ + { + "qualified_name": "acris.LLC", + "kind": "GRAPH_NODE", + "properties": [{"name": "name", "type": "string", "nullable": False}], + "obligation": { + "required_fields": ["name"], + "primary_key": "NATURAL_KEY", + }, + }, + { + "qualified_name": "acris.Property", + "kind": "GRAPH_NODE", + "properties": [{"name": "name", "type": "string", "nullable": False}], + "obligation": { + "required_fields": ["name"], + "primary_key": "NATURAL_KEY", + }, + }, + { + "qualified_name": "acris.OWNS", + "kind": 
"GRAPH_EDGE", + "endpoints": { + "subject": { + "target_entity": "acris.LLC", + "cardinality": "one", + "nullable": False, + }, + "object": { + "target_entity": "acris.Property", + "cardinality": "one", + "nullable": False, + }, + }, + "properties": [ + {"name": "valid_from", "type": "date", "nullable": False} + ], + "obligation": { + "required_fields": ["subject", "object", "valid_from"], + "primary_key": "NATURAL_KEY", + "minimum_viable_row": { + "op": "AND", + "clauses": [ + {"kind": "presence", "field": "subject"}, + {"kind": "presence", "field": "object"}, + {"kind": "presence", "field": "valid_from"}, + ], + }, + }, + }, + ], + } + + +def _rich_manifest() -> dict: + payload = _minimal_manifest() + payload["entities"].append( + { + "qualified_name": "fake.observation", + "kind": "TABLE_ROW", + "properties": [ + {"name": "obs_id", "type": "integer", "nullable": False}, + ], + "obligation": { + "required_fields": ["obs_id"], + "primary_key": "NATURAL_KEY", + }, + "context_card": { + "card_version": "1.0.0", + "description": "Observation table.", + }, + } + ) + payload["vocabularies"] = [{"name": "SNOMED", "source": "EXTERNAL"}] + return payload diff --git a/tests/unit/targets/test_manifest_registration.py b/tests/unit/targets/test_manifest_registration.py new file mode 100644 index 0000000..4c0067c --- /dev/null +++ b/tests/unit/targets/test_manifest_registration.py @@ -0,0 +1,69 @@ +"""ManifestTargetAdapter registry registration (task 6.17).""" + +from __future__ import annotations + +import pytest + +from sema.targets.adapters.manifest import ( + MANIFEST_ADAPTER_ID, + MANIFEST_REGISTRY_TARGET_MODEL_ID, + ManifestTargetAdapter, + register_manifest_adapter, +) +from sema.targets.registry import get, list_registered + +pytestmark = pytest.mark.unit + + +def test_register_manifest_adapter_registers_under_sentinel() -> None: + register_manifest_adapter() + cls = get(MANIFEST_ADAPTER_ID, MANIFEST_REGISTRY_TARGET_MODEL_ID) + assert cls is ManifestTargetAdapter + + 
+def test_manifest_registration_uses_wildcard_supported_versions() -> None: + register_manifest_adapter() + rows = list_registered() + assert (MANIFEST_ADAPTER_ID, MANIFEST_REGISTRY_TARGET_MODEL_ID, "") in rows + + +def test_register_manifest_adapter_is_idempotent_after_clear() -> None: + register_manifest_adapter() + register_manifest_adapter() + cls = get(MANIFEST_ADAPTER_ID, MANIFEST_REGISTRY_TARGET_MODEL_ID) + assert cls is ManifestTargetAdapter + + +def test_manifest_lookup_resolves_for_arbitrary_target_model_id() -> None: + register_manifest_adapter() + assert get("manifest", "acris-nyc") is ManifestTargetAdapter + assert get("manifest", "omop-cdm", "5.4") is ManifestTargetAdapter + + +def test_manifest_wildcard_does_not_shadow_specific_registration() -> None: + """A specific (adapter_id, target_model_id) registration overrides + the wildcard manifest registration for that exact key.""" + from sema.targets.registry import register_target_adapter + + register_manifest_adapter() + specific = type("SpecificAdapter", (ManifestTargetAdapter,), {}) + register_target_adapter( + adapter_id="manifest", + target_model_id="omop-cdm", + supported_versions="", + )(specific) + assert get("manifest", "omop-cdm") is specific + assert get("manifest", "acris-nyc") is ManifestTargetAdapter + + +def test_unknown_target_for_non_wildcard_adapter_still_raises( + fake_adapter_cls: type, +) -> None: + from sema.targets.registry import register_target_adapter as reg + from sema.targets import UnknownAdapterError + + reg(adapter_id="omop_cdm", target_model_id="omop-cdm", supported_versions="")( + fake_adapter_cls + ) + with pytest.raises(UnknownAdapterError): + get("omop_cdm", "no-such-model") diff --git a/tests/unit/targets/test_materializer.py b/tests/unit/targets/test_materializer.py new file mode 100644 index 0000000..16e9f61 --- /dev/null +++ b/tests/unit/targets/test_materializer.py @@ -0,0 +1,485 @@ +"""Materializer behavior: write sequence, idempotency, endpoint properties.""" 
+ +from __future__ import annotations + +import pytest + +from sema.models.planner._enums import PrimaryKeyStrategy +from sema.models.target.obligation import TargetObligationDecl +from sema.models.target.properties import TargetPropertyDecl +from sema.targets.loader import load_target +from sema.targets.materializer import InMemoryGraphWriter +from sema.targets.materializer_ops import ( + ConstraintOp, + ContextCardOp, + EnrichmentDecisionOp, + EntityOp, + PropertyOp, + TargetObligationOp, + VocabularyBindingOp, +) + +from tests.unit.targets.conftest import ( + ScriptedAdapter, + make_descriptor, + make_graph_edge_entity, + make_graph_node_entity, + make_obligation, + make_table_row_entity, +) + +pytestmark = pytest.mark.unit + + +def _basic_adapter() -> ScriptedAdapter: + entity = make_table_row_entity() + obligation = make_obligation() + return ScriptedAdapter(make_descriptor(), [entity], [obligation]) + + +def test_materializer_writes_entity_property_obligation_decision() -> None: + writer = InMemoryGraphWriter() + load_target(_basic_adapter(), writer=writer) + op_types = [type(op).__name__ for op in writer.ops] + assert "EntityOp" in op_types + assert "PropertyOp" in op_types + assert "TargetObligationOp" in op_types + assert "EnrichmentDecisionOp" in op_types + + +def test_idempotent_write_sequence() -> None: + adapter = _basic_adapter() + writer1 = InMemoryGraphWriter() + writer2 = InMemoryGraphWriter() + load_target(adapter, writer=writer1) + load_target(adapter, writer=writer2) + assert _normalize_ops(writer1.ops) == _normalize_ops(writer2.ops) + + +def _normalize_ops(ops: list[object]) -> list[tuple[str, str]]: + return [(type(op).__name__, _digest_for_op(op)) for op in ops] + + +def _digest_for_op(op: object) -> str: + if isinstance(op, EnrichmentDecisionOp): + return f"{op.entity_ref}|{op.target_schema_snapshot_hash}" + if isinstance(op, EntityOp): + return f"{op.qualified_name}|{op.target_schema_snapshot_hash}" + if isinstance(op, PropertyOp): + return 
( + f"{op.parent_entity_qualified_name}.{op.name}|" + f"{op.target_schema_snapshot_hash}|{op.property_kind}" + ) + if isinstance(op, TargetObligationOp): + return f"{op.target_entity}|{op.target_schema_snapshot_hash}" + return repr(op) + + +def test_endpoint_properties_emitted_with_endpoint_kind() -> None: + llc = make_graph_node_entity(qualified_name="acris.LLC") + prop_node = make_graph_node_entity(qualified_name="acris.Property") + edge = make_graph_edge_entity( + target_model_id="fake-target", + qualified_name="acris.OWNS", + subject_target=llc.ref, + object_target=prop_node.ref, + columnar_properties=[ + TargetPropertyDecl(name="valid_from", type="date", nullable=False) + ], + ) + adapter = ScriptedAdapter( + make_descriptor(), + [llc, prop_node, edge], + [ + make_obligation("acris.LLC", required_fields=["name"]), + make_obligation("acris.Property", required_fields=["name"]), + TargetObligationDecl( + target_entity="acris.OWNS", + required_fields=["subject", "object", "valid_from"], + primary_key=PrimaryKeyStrategy.NATURAL_KEY, + ), + ], + ) + writer = InMemoryGraphWriter() + load_target(adapter, writer=writer) + edge_props = [ + op + for op in writer.ops + if isinstance(op, PropertyOp) and op.parent_entity_qualified_name == "acris.OWNS" + ] + by_name = {p.name: p for p in edge_props} + assert {"subject", "object", "valid_from"} <= set(by_name) + subject = by_name["subject"] + assert subject.property_kind == "ENDPOINT" + assert subject.endpoint_role == "subject" + assert subject.endpoint_target_entity_qualified_name == "acris.LLC" + assert subject.materialized_as_edge_property is False + valid_from = by_name["valid_from"] + assert valid_from.property_kind == "COLUMN" + assert valid_from.materialized_as_edge_property is True + + +def test_entity_op_carries_target_provenance_and_compact_status() -> None: + writer = InMemoryGraphWriter() + loaded = load_target(_basic_adapter(), writer=writer) + entity_ops = [op for op in writer.ops if isinstance(op, EntityOp)] + 
assert len(entity_ops) == 1 + op = entity_ops[0] + assert op.target_model_id == "fake-target" + assert op.target_schema_snapshot_hash == loaded.target_schema_snapshot_hash + assert set(op.enrichment_status.keys()) == { + "structure", + "obligations", + "vocabulary_bindings", + "semantic_aliases", + "terms", + } + + +def test_materializer_writes_constraint_for_each_domain_constraint() -> None: + from sema.models.planner.target_model import DomainConstraint + + entity = make_table_row_entity( + properties=[ + TargetPropertyDecl(name="person_id", type="string", nullable=False), + TargetPropertyDecl( + name="gender_concept_id", type="string", nullable=False + ), + ], + ) + obligation = TargetObligationDecl( + target_entity="fake.person", + required_fields=["person_id"], + primary_key=PrimaryKeyStrategy.NATURAL_KEY, + domain_constraints=[ + DomainConstraint( + property_name="gender_concept_id", domain_id="omop.Gender" + ), + ], + ) + adapter = ScriptedAdapter(make_descriptor(), [entity], [obligation]) + writer = InMemoryGraphWriter() + loaded = load_target(adapter, writer=writer) + constraint_ops = [op for op in writer.ops if isinstance(op, ConstraintOp)] + assert len(constraint_ops) == 1 + cop = constraint_ops[0] + assert cop.target_model_id == "fake-target" + assert cop.target_schema_snapshot_hash == loaded.target_schema_snapshot_hash + assert cop.attached_property_id == "fake.person.gender_concept_id" + assert cop.constraint_kind == "domain_binding" + assert cop.payload == {"domain_id": "omop.Gender"} + + +def test_materializer_skips_constraints_for_endpoint_properties() -> None: + from sema.models.planner.target_model import DomainConstraint + + llc = make_graph_node_entity(qualified_name="acris.LLC") + prop_node = make_graph_node_entity(qualified_name="acris.Property") + edge = make_graph_edge_entity( + target_model_id="fake-target", + qualified_name="acris.OWNS", + subject_target=llc.ref, + object_target=prop_node.ref, + ) + edge_obligation = TargetObligationDecl( 
+ target_entity="acris.OWNS", + required_fields=["subject", "object"], + primary_key=PrimaryKeyStrategy.NATURAL_KEY, + domain_constraints=[ + DomainConstraint(property_name="subject", domain_id="ignored"), + ], + ) + adapter = ScriptedAdapter( + make_descriptor(), + [llc, prop_node, edge], + [ + make_obligation("acris.LLC", required_fields=["name"]), + make_obligation("acris.Property", required_fields=["name"]), + edge_obligation, + ], + ) + writer = InMemoryGraphWriter() + load_target(adapter, writer=writer) + constraint_ops = [op for op in writer.ops if isinstance(op, ConstraintOp)] + edge_constraints = [ + c for c in constraint_ops if c.attached_property_id.startswith("acris.OWNS.") + ] + assert edge_constraints == [] + + +def test_constraint_op_idempotent_across_runs() -> None: + from sema.models.planner.target_model import DomainConstraint + + entity = make_table_row_entity( + properties=[ + TargetPropertyDecl(name="person_id", type="string", nullable=False), + TargetPropertyDecl( + name="gender_concept_id", type="string", nullable=False + ), + ], + ) + obligation = TargetObligationDecl( + target_entity="fake.person", + required_fields=["person_id"], + primary_key=PrimaryKeyStrategy.NATURAL_KEY, + domain_constraints=[ + DomainConstraint( + property_name="gender_concept_id", domain_id="omop.Gender" + ), + ], + ) + adapter = ScriptedAdapter(make_descriptor(), [entity], [obligation]) + w1 = InMemoryGraphWriter() + w2 = InMemoryGraphWriter() + load_target(adapter, writer=w1) + load_target(adapter, writer=w2) + c1 = [op for op in w1.ops if isinstance(op, ConstraintOp)] + c2 = [op for op in w2.ops if isinstance(op, ConstraintOp)] + assert c1 == c2 + + +def test_has_property_relationship_carries_full_hash_versioned_keys() -> None: + from sema.targets.materializer_ops import RelationshipOp + + writer = InMemoryGraphWriter() + loaded = load_target(_basic_adapter(), writer=writer) + rels = [ + op + for op in writer.ops + if isinstance(op, RelationshipOp) and op.rel_type 
== "HAS_PROPERTY" + ] + assert len(rels) == 1 + rel = rels[0] + h = loaded.target_schema_snapshot_hash + for keys in (rel.from_keys, rel.to_keys): + assert keys["target_model_id"] == "fake-target" + assert keys["target_model_version"] == "1.0.0" + assert keys["target_schema_snapshot_hash"] == h + + +def test_has_obligation_relationship_emitted_with_full_keys() -> None: + from sema.targets.materializer_ops import RelationshipOp + + writer = InMemoryGraphWriter() + loaded = load_target(_basic_adapter(), writer=writer) + rels = [ + op + for op in writer.ops + if isinstance(op, RelationshipOp) and op.rel_type == "HAS_OBLIGATION" + ] + assert len(rels) == 1 + rel = rels[0] + h = loaded.target_schema_snapshot_hash + for keys in (rel.from_keys, rel.to_keys): + assert keys["target_model_id"] == "fake-target" + assert keys["target_model_version"] == "1.0.0" + assert keys["target_schema_snapshot_hash"] == h + + +def test_has_enrichment_decision_relationship_carries_full_keys() -> None: + from sema.targets.materializer_ops import RelationshipOp + + writer = InMemoryGraphWriter() + loaded = load_target(_basic_adapter(), writer=writer) + rels = [ + op + for op in writer.ops + if isinstance(op, RelationshipOp) and op.rel_type == "HAS_ENRICHMENT_DECISION" + ] + assert len(rels) == 1 + rel = rels[0] + h = loaded.target_schema_snapshot_hash + for keys in (rel.from_keys, rel.to_keys): + assert keys["target_model_id"] == "fake-target" + assert keys["target_model_version"] == "1.0.0" + assert keys["target_schema_snapshot_hash"] == h + + +def test_materializer_emits_current_flip_op_for_loaded_logical_keys() -> None: + from sema.targets.materializer_ops import CurrentFlipOp + + writer = InMemoryGraphWriter() + loaded = load_target(_basic_adapter(), writer=writer) + flips = [op for op in writer.ops if isinstance(op, CurrentFlipOp)] + assert len(flips) == 1 + flip = flips[0] + assert flip.target_model_id == "fake-target" + assert flip.target_model_version == "1.0.0" + assert 
flip.current_snapshot_hash == loaded.target_schema_snapshot_hash + assert "fake.person" in flip.entity_qualified_names + + +def test_lazy_subset_flip_scoped_to_selected_only() -> None: + from sema.targets.materializer_ops import CurrentFlipOp + + e1 = make_table_row_entity(qualified_name="omop.person") + e2 = make_table_row_entity(qualified_name="omop.observation") + o1 = make_obligation("omop.person") + o2 = make_obligation("omop.observation") + adapter = ScriptedAdapter(make_descriptor(), [e1, e2], [o1, o2]) + writer = InMemoryGraphWriter() + load_target(adapter, writer=writer, selected_refs=[e1.ref]) + flips = [op for op in writer.ops if isinstance(op, CurrentFlipOp)] + assert len(flips) == 1 + assert flips[0].entity_qualified_names == ("omop.person",) + + +def test_vocabulary_binding_op_emitted_with_full_hooks() -> None: + from sema.models.target.refs import VocabularyRef, VocabularySource + from sema.models.target.vocab_binding import VocabularyBindingDecl + + entity = make_table_row_entity( + properties=[ + TargetPropertyDecl(name="person_id", type="string", nullable=False), + TargetPropertyDecl( + name="gender_concept_id", type="string", nullable=False + ), + ], + ) + binding = VocabularyBindingDecl( + entity_ref=entity.ref, + property_name="gender_concept_id", + vocabulary=VocabularyRef(name="SNOMED", source=VocabularySource.EXTERNAL), + domain="Gender", + require_standard=True, + allow_zero_default=True, + effective_date_ref="visit.start_date", + resolver_policy_ref="omop.snomed.gender.v1", + ) + adapter = ScriptedAdapter( + make_descriptor(), + [entity], + [make_obligation(target_entity=entity.ref.qualified_name)], + bindings=[binding], + ) + writer = InMemoryGraphWriter() + loaded = load_target(adapter, writer=writer) + binding_ops = [op for op in writer.ops if isinstance(op, VocabularyBindingOp)] + assert len(binding_ops) == 1 + bop = binding_ops[0] + assert bop.parent_entity_qualified_name == entity.ref.qualified_name + assert bop.property_name == 
"gender_concept_id" + assert bop.vocabulary_name == "SNOMED" + assert bop.vocabulary_source == "EXTERNAL" + assert bop.domain == "Gender" + assert bop.require_standard is True + assert bop.allow_zero_default is True + assert bop.effective_date_ref == "visit.start_date" + assert bop.resolver_policy_ref == "omop.snomed.gender.v1" + assert bop.target_schema_snapshot_hash == loaded.target_schema_snapshot_hash + + +def test_vocabulary_binding_relationship_emitted() -> None: + from sema.models.target.refs import VocabularyRef, VocabularySource + from sema.models.target.vocab_binding import VocabularyBindingDecl + from sema.targets.materializer_ops import RelationshipOp + + entity = make_table_row_entity() + binding = VocabularyBindingDecl( + entity_ref=entity.ref, + property_name="person_id", + vocabulary=VocabularyRef(name="SNOMED", source=VocabularySource.EXTERNAL), + ) + adapter = ScriptedAdapter( + make_descriptor(), + [entity], + [make_obligation()], + bindings=[binding], + ) + writer = InMemoryGraphWriter() + load_target(adapter, writer=writer) + rels = [ + op + for op in writer.ops + if isinstance(op, RelationshipOp) and op.rel_type == "HAS_VOCABULARY_BINDING" + ] + assert len(rels) == 1 + assert rels[0].from_label == "Property" + assert rels[0].to_label == "VocabularyBinding" + + +def test_context_card_op_emitted_with_content_and_hash() -> None: + writer = InMemoryGraphWriter() + loaded = load_target(_basic_adapter(), writer=writer) + card_ops = [op for op in writer.ops if isinstance(op, ContextCardOp)] + assert len(card_ops) == 1 + cop = card_ops[0] + assert cop.entity_qualified_name == "fake.person" + assert cop.card_version == "0.0.0+synthesized" + assert len(cop.card_hash) == 64 + assert cop.target_schema_snapshot_hash == loaded.target_schema_snapshot_hash + + +def test_context_card_relationship_emitted_from_entity() -> None: + from sema.targets.materializer_ops import RelationshipOp + + writer = InMemoryGraphWriter() + load_target(_basic_adapter(), 
writer=writer) + rels = [ + op + for op in writer.ops + if isinstance(op, RelationshipOp) and op.rel_type == "HAS_CONTEXT_CARD" + ] + assert len(rels) == 1 + assert rels[0].from_label == "Entity" + assert rels[0].to_label == "ContextCard" + + +def test_loaded_target_exposes_context_cards_and_hashes() -> None: + loaded = load_target(_basic_adapter(), writer=InMemoryGraphWriter()) + assert len(loaded.context_cards) == 1 + assert loaded.context_cards[0].entity_ref.qualified_name == "fake.person" + assert "fake.person" in loaded.card_hashes + assert len(loaded.card_hashes["fake.person"]) == 64 + + +def test_current_flip_op_includes_binding_card_term_keys() -> None: + from sema.models.target.refs import VocabularyRef, VocabularySource + from sema.models.target.term import TargetTermDecl + from sema.models.target.vocab_binding import VocabularyBindingDecl + from sema.targets.materializer_ops import CurrentFlipOp + + entity = make_table_row_entity() + binding = VocabularyBindingDecl( + entity_ref=entity.ref, + property_name="person_id", + vocabulary=VocabularyRef(name="GENDER_CV", source=VocabularySource.INLINE), + ) + term = TargetTermDecl( + vocabulary=VocabularyRef(name="GENDER_CV", source=VocabularySource.INLINE), + code="M", + display="Male", + ) + adapter = ScriptedAdapter( + make_descriptor(), + [entity], + [make_obligation()], + bindings=[binding], + terms=[term], + ) + writer = InMemoryGraphWriter() + load_target(adapter, writer=writer) + flips = [op for op in writer.ops if isinstance(op, CurrentFlipOp)] + assert len(flips) == 1 + flip = flips[0] + assert ("fake.person", "person_id", "GENDER_CV") in flip.vocabulary_binding_keys + assert any(k[0] == "fake.person" for k in flip.context_card_keys) + assert ("GENDER_CV", "M") in flip.term_keys + + +def test_decision_op_decisions_json_round_trips() -> None: + import json + + writer = InMemoryGraphWriter() + load_target(_basic_adapter(), writer=writer) + decision_ops = [op for op in writer.ops if isinstance(op, 
EnrichmentDecisionOp)] + assert len(decision_ops) == 1 + parsed = json.loads(decision_ops[0].decisions_json) + assert set(parsed.keys()) == { + "structure", + "obligations", + "vocabulary_bindings", + "semantic_aliases", + "terms", + } + assert parsed["semantic_aliases"]["status"] == "required_deferred" diff --git a/tests/unit/targets/test_neo4j_writer.py b/tests/unit/targets/test_neo4j_writer.py new file mode 100644 index 0000000..44991f0 --- /dev/null +++ b/tests/unit/targets/test_neo4j_writer.py @@ -0,0 +1,371 @@ +"""Unit checks for Neo4jGraphWriter using a mock driver (5.2).""" + +from __future__ import annotations + +from datetime import datetime, timezone +from unittest.mock import MagicMock + +import pytest + +from sema.models.planner._enums import ModelRole +from sema.targets.materializer_ops import ( + ConstraintOp, + ContextCardOp, + CurrentFlipOp, + EnrichmentDecisionOp, + EntityOp, + PropertyOp, + RelationshipOp, + TargetObligationOp, + TermOp, + VocabularyBindingOp, +) +from sema.targets.neo4j_writer import Neo4jGraphWriter + +pytestmark = pytest.mark.unit + + +def _mock_driver_capturing(calls: list[tuple[str, dict]]) -> MagicMock: + driver = MagicMock() + session = MagicMock() + session.__enter__ = MagicMock(return_value=session) + session.__exit__ = MagicMock(return_value=False) + session.run.side_effect = lambda cypher, **kwargs: calls.append( + (cypher, kwargs) + ) + driver.session.return_value = session + return driver + + +def _entity_op(qname: str = "fake.person") -> EntityOp: + return EntityOp( + target_model_id="fake-target", + target_model_version="1.0.0", + target_schema_snapshot_hash="h" * 64, + qualified_name=qname, + kind="TABLE_ROW", + enrichment_status={ + "structure": "not_required", + "obligations": "not_required", + "vocabulary_bindings": "required_deferred", + "semantic_aliases": "required_deferred", + "terms": "not_required", + }, + ) + + +def test_write_entity_emits_merge_with_target_role() -> None: + calls: list[tuple[str, dict]] 
= [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + writer.write_entity(_entity_op()) + assert len(calls) == 1 + cypher, params = calls[0] + assert "MERGE (n:Entity" in cypher + assert "qualified_name: $qualified_name" in cypher + assert params["model_role"] == ModelRole.TARGET.value + assert params["is_current"] is True + assert params["enrichment_vocabulary_bindings_status"] == "required_deferred" + + +def test_write_property_emits_full_hash_versioned_merge_keys() -> None: + calls: list[tuple[str, dict]] = [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + op = PropertyOp( + target_model_id="t", + target_model_version="1", + target_schema_snapshot_hash="h", + parent_entity_qualified_name="e", + name="p", + type="string", + nullable=False, + ) + writer.write_property(op) + cypher, params = calls[0] + assert "MERGE (n:Property" in cypher + assert "parent_entity_qualified_name: $parent_entity_qualified_name" in cypher + assert params["property_kind"] == "COLUMN" + assert params["materialized_as_edge_property"] is True + + +def test_write_endpoint_property_carries_endpoint_typing() -> None: + calls: list[tuple[str, dict]] = [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + op = PropertyOp( + target_model_id="t", + target_model_version="1", + target_schema_snapshot_hash="h", + parent_entity_qualified_name="acris.OWNS", + name="subject", + type="ref", + nullable=False, + property_kind="ENDPOINT", + endpoint_role="subject", + endpoint_target_entity_qualified_name="acris.LLC", + endpoint_cardinality="one", + endpoint_nullable=False, + materialized_as_edge_property=False, + ) + writer.write_property(op) + _, params = calls[0] + assert params["property_kind"] == "ENDPOINT" + assert params["endpoint_role"] == "subject" + assert params["endpoint_target_entity_qualified_name"] == "acris.LLC" + assert params["endpoint_cardinality"] == "one" + assert params["endpoint_nullable"] is False + assert params["materialized_as_edge_property"] is 
False + + +def test_write_constraint_uses_payload_hash_in_merge_key() -> None: + calls: list[tuple[str, dict]] = [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + op = ConstraintOp( + target_model_id="t", + target_model_version="1", + target_schema_snapshot_hash="h", + attached_property_id="e.p", + constraint_kind="domain_binding", + payload={"domain_id": "x"}, + payload_hash="abc123", + ) + writer.write_constraint(op) + cypher, params = calls[0] + assert "MERGE (n:Constraint" in cypher + assert "payload_hash: $payload_hash" in cypher + assert params["payload_hash"] == "abc123" + assert params["model_role"] == ModelRole.TARGET.value + + +def test_write_target_obligation_serialises_payload_json() -> None: + calls: list[tuple[str, dict]] = [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + op = TargetObligationOp( + target_model_id="t", + target_model_version="1", + target_schema_snapshot_hash="h", + target_entity="fake.person", + payload={"required_fields": ["person_id"], "primary_key": "NATURAL_KEY"}, + ) + writer.write_target_obligation(op) + cypher, params = calls[0] + assert "MERGE (n:TargetObligation" in cypher + assert "person_id" in params["payload_json"] + assert params["model_role"] == ModelRole.TARGET.value + + +def test_write_enrichment_decision_emits_decisions_json() -> None: + calls: list[tuple[str, dict]] = [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + op = EnrichmentDecisionOp( + target_model_id="t", + target_model_version="1", + target_schema_snapshot_hash="h", + entity_ref="fake.person", + decisions_json='{"structure":{"status":"not_required"}}', + decided_at=datetime(2026, 1, 1, tzinfo=timezone.utc), + ) + writer.write_enrichment_decision(op) + cypher, params = calls[0] + assert "MERGE (n:EnrichmentDecision" in cypher + assert params["decisions_json"].startswith("{") + assert "2026-01-01" in params["decided_at"] + + +def test_write_relationship_matches_both_endpoints_by_full_keys() -> None: + calls: 
list[tuple[str, dict]] = [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + op = RelationshipOp( + rel_type="HAS_PROPERTY", + target_schema_snapshot_hash="h", + from_label="Entity", + from_keys={ + "target_model_id": "t", + "target_model_version": "1", + "target_schema_snapshot_hash": "h", + "qualified_name": "e", + }, + to_label="Property", + to_keys={ + "target_model_id": "t", + "target_model_version": "1", + "target_schema_snapshot_hash": "h", + "parent_entity_qualified_name": "e", + "name": "p", + }, + ) + writer.write_relationship(op) + cypher, params = calls[0] + assert "MATCH (a:Entity)" in cypher + assert "MATCH (b:Property)" in cypher + assert "HAS_PROPERTY" in cypher + assert params["from_qualified_name"] == "e" + assert params["to_name"] == "p" + + +def test_flip_prior_generations_filters_by_logical_identity() -> None: + calls: list[tuple[str, dict]] = [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + op = CurrentFlipOp( + target_model_id="t", + target_model_version="1", + current_snapshot_hash="h-current", + entity_qualified_names=("fake.person",), + property_keys=(("fake.person", "person_id"),), + obligation_target_entities=("fake.person",), + enrichment_entity_refs=("fake.person",), + ) + writer.flip_prior_generations(op) + labels = [c[0] for c in calls] + assert any(":Entity)" in c for c in labels) + assert any(":Property)" in c for c in labels) + assert any(":TargetObligation)" in c for c in labels) + assert any(":EnrichmentDecision)" in c for c in labels) + assert any(":Constraint)" in c for c in labels) + for cypher, params in calls: + assert "n.target_schema_snapshot_hash <> $current_hash" in cypher + assert "SET n.is_current = false" in cypher + assert params["current_hash"] == "h-current" + + +def test_flip_skips_empty_buckets() -> None: + calls: list[tuple[str, dict]] = [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + op = CurrentFlipOp( + target_model_id="t", + target_model_version="1", + 
current_snapshot_hash="h-current", + entity_qualified_names=("only.entity",), + ) + writer.flip_prior_generations(op) + assert len(calls) == 1 + + +def test_target_obligation_merge_sets_is_current_true() -> None: + calls: list[tuple[str, dict]] = [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + op = TargetObligationOp( + target_model_id="t", + target_model_version="1", + target_schema_snapshot_hash="h", + target_entity="fake.person", + payload={"required_fields": ["person_id"]}, + ) + writer.write_target_obligation(op) + cypher, params = calls[0] + assert "n.is_current = $is_current" in cypher + assert params["is_current"] is True + + +def test_term_merge_sets_is_current_true() -> None: + calls: list[tuple[str, dict]] = [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + op = TermOp( + target_model_id="t", + target_model_version="1", + target_schema_snapshot_hash="h", + vocabulary_name="GENDER_CV", + code="M", + display="Male", + ) + writer.write_term(op) + cypher, params = calls[0] + assert "n.is_current = $is_current" in cypher + assert params["is_current"] is True + + +def test_vocabulary_binding_merge_keyed_on_full_tuple() -> None: + calls: list[tuple[str, dict]] = [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + op = VocabularyBindingOp( + target_model_id="t", + target_model_version="1", + target_schema_snapshot_hash="h", + parent_entity_qualified_name="omop.person", + property_name="gender_concept_id", + vocabulary_name="SNOMED", + vocabulary_source="EXTERNAL", + domain="Gender", + require_standard=True, + allow_zero_default=False, + effective_date_ref="visit.start_date", + resolver_policy_ref="omop.snomed.gender.v1", + ) + writer.write_vocabulary_binding(op) + cypher, params = calls[0] + assert "MERGE (n:VocabularyBinding" in cypher + assert "vocabulary_name: $vocabulary_name" in cypher + assert params["domain"] == "Gender" + assert params["require_standard"] is True + assert params["effective_date_ref"] == 
"visit.start_date" + assert params["resolver_policy_ref"] == "omop.snomed.gender.v1" + assert params["model_role"] == ModelRole.TARGET.value + + +def test_context_card_merge_carries_card_hash_and_content() -> None: + calls: list[tuple[str, dict]] = [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + op = ContextCardOp( + target_model_id="t", + target_model_version="1", + target_schema_snapshot_hash="h", + entity_qualified_name="omop.person", + card_version="1.0.0", + card_hash="0" * 64, + description="OMOP person row.", + examples=["sample"], + obligation_summary="PK + gender concept", + curated_synonyms=["patient"], + ) + writer.write_context_card(op) + cypher, params = calls[0] + assert "MERGE (n:ContextCard" in cypher + assert "n.card_hash = $card_hash" in cypher + assert params["card_version"] == "1.0.0" + assert params["card_hash"] == "0" * 64 + assert params["examples"] == ["sample"] + assert params["curated_synonyms"] == ["patient"] + assert params["model_role"] == ModelRole.TARGET.value + + +def test_flip_covers_obligations_terms_constraints_bindings_cards() -> None: + calls: list[tuple[str, dict]] = [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + op = CurrentFlipOp( + target_model_id="t", + target_model_version="1", + current_snapshot_hash="h-current", + entity_qualified_names=("e",), + property_keys=(("e", "p"),), + obligation_target_entities=("e",), + enrichment_entity_refs=("e",), + vocabulary_binding_keys=(("e", "p", "SNOMED"),), + context_card_keys=(("e", "1.0.0"),), + term_keys=(("SNOMED", "12345"),), + ) + writer.flip_prior_generations(op) + bodies = [c[0] for c in calls] + assert any(":TargetObligation" in b for b in bodies) + assert any(":Term" in b for b in bodies) + assert any(":Constraint" in b for b in bodies) + assert any(":VocabularyBinding" in b for b in bodies) + assert any(":ContextCard" in b for b in bodies) + for cypher, params in calls: + assert "$current_hash" in cypher + assert params["current_hash"] == 
"h-current" + + +def test_term_merge_keyed_on_full_hash_versioned_tuple() -> None: + calls: list[tuple[str, dict]] = [] + writer = Neo4jGraphWriter(_mock_driver_capturing(calls)) + op = TermOp( + target_model_id="t", + target_model_version="1", + target_schema_snapshot_hash="h", + vocabulary_name="GENDER_CV", + code="M", + display="Male", + ) + writer.write_term(op) + cypher, params = calls[0] + assert "vocabulary_name: $vocabulary_name" in cypher + assert "code: $code" in cypher + assert params["model_role"] == ModelRole.TARGET.value diff --git a/tests/unit/targets/test_normalizer.py b/tests/unit/targets/test_normalizer.py new file mode 100644 index 0000000..0e40792 --- /dev/null +++ b/tests/unit/targets/test_normalizer.py @@ -0,0 +1,289 @@ +"""Normalizer behavior: cross-ref resolution, ordering, endpoint synthesis.""" + +from __future__ import annotations + +import pytest + +from sema.models.planner._enums import PrimaryKeyStrategy, TargetArtifactKind +from sema.models.planner.target_model import ForeignKeyObligation +from sema.models.target.obligation import TargetObligationDecl +from sema.models.target.properties import PropertyKind, TargetPropertyDecl +from sema.models.target.refs import ( + TargetEntityRef, + VocabularyRef, + VocabularySource, +) +from sema.models.target.term import TargetTermDecl +from sema.models.target.vocab_binding import VocabularyBindingDecl +from sema.targets.exceptions import DanglingRefError +from sema.targets.normalizer import TargetModelNormalizer + +from tests.unit.targets.conftest import ( + ScriptedAdapter, + make_descriptor, + make_graph_edge_entity, + make_graph_node_entity, + make_obligation, + make_table_row_entity, +) + +pytestmark = pytest.mark.unit + + +def test_dangling_property_in_obligation_rejected() -> None: + entity = make_table_row_entity(qualified_name="fake.person") + bad_obligation = TargetObligationDecl( + target_entity="fake.person", + required_fields=["nonexistent_field"], + 
primary_key=PrimaryKeyStrategy.NATURAL_KEY, + ) + adapter = ScriptedAdapter(make_descriptor(), [entity], [bad_obligation]) + with pytest.raises(DanglingRefError) as excinfo: + TargetModelNormalizer.normalize(adapter) + assert "nonexistent_field" in str(excinfo.value) + + +def test_dangling_vocabulary_binding_rejected() -> None: + entity = make_table_row_entity() + obligation = make_obligation() + binding = VocabularyBindingDecl( + entity_ref=entity.ref, + property_name="person_id", + vocabulary=VocabularyRef(name="UNKNOWN", source=VocabularySource.INLINE), + ) + adapter = ScriptedAdapter( + make_descriptor(), [entity], [obligation], bindings=[binding] + ) + with pytest.raises(DanglingRefError, match="UNKNOWN"): + TargetModelNormalizer.normalize(adapter) + + +def test_external_vocabulary_binding_accepted() -> None: + entity = make_table_row_entity() + obligation = make_obligation() + binding = VocabularyBindingDecl( + entity_ref=entity.ref, + property_name="person_id", + vocabulary=VocabularyRef(name="SNOMED", source=VocabularySource.EXTERNAL), + ) + adapter = ScriptedAdapter( + make_descriptor(), [entity], [obligation], bindings=[binding] + ) + normalized = TargetModelNormalizer.normalize(adapter) + assert any(b.vocabulary.name == "SNOMED" for b in normalized.vocabulary_bindings) + + +def test_dangling_fk_referenced_entity_rejected() -> None: + entity = make_table_row_entity() + obligation = TargetObligationDecl( + target_entity="fake.person", + required_fields=["person_id"], + primary_key=PrimaryKeyStrategy.NATURAL_KEY, + foreign_keys=[ + ForeignKeyObligation( + referenced_entity="fake.absent", + join_keys=[("person_id", "ref_id")], + ) + ], + ) + adapter = ScriptedAdapter(make_descriptor(), [entity], [obligation]) + with pytest.raises(DanglingRefError, match="fake.absent"): + TargetModelNormalizer.normalize(adapter) + + +def test_endpoint_synthesis_creates_subject_and_object_properties() -> None: + llc = make_graph_node_entity(qualified_name="acris.LLC") + 
prop_node = make_graph_node_entity(qualified_name="acris.Property") + edge = make_graph_edge_entity( + target_model_id="fake-target", + qualified_name="acris.OWNS", + subject_target=llc.ref, + object_target=prop_node.ref, + columnar_properties=[ + TargetPropertyDecl(name="valid_from", type="date", nullable=False) + ], + ) + obligation = TargetObligationDecl( + target_entity="acris.OWNS", + required_fields=["subject", "object", "valid_from"], + primary_key=PrimaryKeyStrategy.NATURAL_KEY, + ) + adapter = ScriptedAdapter( + make_descriptor(), + [llc, prop_node, edge], + [ + make_obligation("acris.LLC", required_fields=["name"]), + make_obligation("acris.Property", required_fields=["name"]), + obligation, + ], + ) + normalized = TargetModelNormalizer.normalize(adapter) + edges = [e for e in normalized.entities if e.ref.qualified_name == "acris.OWNS"] + assert len(edges) == 1 + edge_props = edges[0].properties + by_name = {p.name: p for p in edge_props} + assert {"subject", "object", "valid_from"} <= set(by_name) + assert by_name["subject"].property_kind is PropertyKind.ENDPOINT + assert by_name["subject"].endpoint_target_entity_qualified_name == "acris.LLC" + assert by_name["object"].endpoint_target_entity_qualified_name == "acris.Property" + assert by_name["valid_from"].property_kind is PropertyKind.COLUMN + sorted_names = [p.name for p in edge_props] + assert sorted_names == sorted(sorted_names) + + +def test_obligation_referencing_unsynthesized_subject_on_node_rejected() -> None: + bad_node = make_graph_node_entity(qualified_name="acris.LLC") + bad_obligation = TargetObligationDecl( + target_entity="acris.LLC", + required_fields=["subject", "name"], + primary_key=PrimaryKeyStrategy.NATURAL_KEY, + ) + adapter = ScriptedAdapter(make_descriptor(), [bad_node], [bad_obligation]) + with pytest.raises(DanglingRefError, match="subject"): + TargetModelNormalizer.normalize(adapter) + + +def test_endpoint_to_table_row_rejected() -> None: + table = 
make_table_row_entity(qualified_name="fake.person") + other_node = make_graph_node_entity(qualified_name="fake.LLC") + edge = make_graph_edge_entity( + target_model_id="fake-target", + qualified_name="fake.SAME_AS", + subject_target=table.ref, + object_target=other_node.ref, + ) + adapter = ScriptedAdapter( + make_descriptor(), + [table, other_node, edge], + [ + make_obligation("fake.person", required_fields=["person_id"]), + make_obligation("fake.LLC", required_fields=["name"]), + make_obligation("fake.SAME_AS", required_fields=["subject", "object"]), + ], + ) + with pytest.raises(DanglingRefError, match="TABLE_ROW"): + TargetModelNormalizer.normalize(adapter) + + +def test_endpoint_to_missing_entity_rejected() -> None: + a = make_graph_node_entity(qualified_name="acris.LLC") + edge = make_graph_edge_entity( + target_model_id="fake-target", + qualified_name="acris.OWNS", + subject_target=a.ref, + object_target=TargetEntityRef( + target_model_id="fake-target", + qualified_name="acris.Missing", + kind=TargetArtifactKind.GRAPH_NODE, + ), + ) + adapter = ScriptedAdapter( + make_descriptor(), + [a, edge], + [ + make_obligation("acris.LLC", required_fields=["name"]), + make_obligation("acris.OWNS", required_fields=["subject", "object"]), + ], + ) + with pytest.raises(DanglingRefError, match="acris.Missing"): + TargetModelNormalizer.normalize(adapter) + + +def test_normalizer_ordering_invariance_across_adapter_runs() -> None: + a = make_table_row_entity(qualified_name="z.last") + b = make_table_row_entity(qualified_name="a.first") + obligations = [ + make_obligation("z.last", required_fields=["person_id"]), + make_obligation("a.first", required_fields=["person_id"]), + ] + descriptor = make_descriptor() + n1 = TargetModelNormalizer.normalize(ScriptedAdapter(descriptor, [a, b], obligations)) + n2 = TargetModelNormalizer.normalize(ScriptedAdapter(descriptor, [b, a], obligations)) + assert n1 == n2 + assert [e.ref.qualified_name for e in n1.entities] == ["a.first", 
"z.last"] + + +def test_inline_terms_collected_when_iter_terms_works() -> None: + entity = make_table_row_entity() + binding = VocabularyBindingDecl( + entity_ref=entity.ref, + property_name="person_id", + vocabulary=VocabularyRef(name="GENDER_CV", source=VocabularySource.INLINE), + ) + term = TargetTermDecl( + vocabulary=VocabularyRef(name="GENDER_CV", source=VocabularySource.INLINE), + code="M", + display="Male", + ) + adapter = ScriptedAdapter( + make_descriptor(), + [entity], + [make_obligation()], + bindings=[binding], + terms=[term], + ) + normalized = TargetModelNormalizer.normalize(adapter) + assert any(t.code == "M" for t in normalized.terms) + + +def test_six_method_adapter_without_iter_terms_loads_cleanly() -> None: + """The protocol declares iter_terms optional. An adapter that does not + define iter_terms (and does not inherit the mixin) MUST normalize + cleanly; bound vocabularies are treated as EXTERNAL.""" + entity = make_table_row_entity() + binding = VocabularyBindingDecl( + entity_ref=entity.ref, + property_name="person_id", + vocabulary=VocabularyRef(name="SNOMED", source=VocabularySource.EXTERNAL), + ) + + class SixMethodAdapter: + def describe(self): + return make_descriptor() + + def discover_entities(self): + return [entity.ref] + + def load_entity(self, ref): + return entity + + def load_obligation(self, ref): + return make_obligation(target_entity=entity.ref.qualified_name) + + def load_vocabulary_bindings(self, ref): + return [binding] if ref.property_name == "person_id" else [] + + def load_context_card(self, ref): + from sema.models.target.context_card import TargetContextCard + + return TargetContextCard( + entity_ref=ref, + card_version="1.0.0", + description=f"card for {ref.qualified_name}", + ) + + adapter = SixMethodAdapter() + normalized = TargetModelNormalizer.normalize(adapter) + assert any(b.vocabulary.name == "SNOMED" for b in normalized.vocabulary_bindings) + assert normalized.terms == [] + from sema.targets.loader import 
load_target + from sema.targets.materializer import InMemoryGraphWriter + + loaded = load_target(adapter, writer=InMemoryGraphWriter()) + assert loaded.target_schema_snapshot_hash + + +def test_lazy_selected_refs_loads_subset() -> None: + a = make_table_row_entity(qualified_name="fake.alpha") + b = make_table_row_entity(qualified_name="fake.beta") + adapter = ScriptedAdapter( + make_descriptor(), + [a, b], + [ + make_obligation("fake.alpha", required_fields=["person_id"]), + make_obligation("fake.beta", required_fields=["person_id"]), + ], + ) + normalized = TargetModelNormalizer.normalize(adapter, selected_refs=[a.ref]) + assert len(normalized.entities) == 1 + assert normalized.entities[0].ref.qualified_name == "fake.alpha" diff --git a/tests/unit/targets/test_protocol.py b/tests/unit/targets/test_protocol.py new file mode 100644 index 0000000..5850d22 --- /dev/null +++ b/tests/unit/targets/test_protocol.py @@ -0,0 +1,65 @@ +"""Protocol surface tests for TargetOntologyAdapter.""" + +from __future__ import annotations + +import pytest + +from sema.targets import ( + TargetOntologyAdapter, + TargetOntologyAdapterMixin, + register_target_adapter, +) +from sema.targets.base import REQUIRED_METHODS + +pytestmark = pytest.mark.unit + + +def test_fake_adapter_satisfies_runtime_protocol(fake_adapter_cls: type) -> None: + adapter = fake_adapter_cls() + assert isinstance(adapter, TargetOntologyAdapter) + + +def test_register_rejects_class_missing_required_method() -> None: + class IncompleteAdapter: + def describe(self) -> None: + return None + + def discover_entities(self) -> list[None]: + return [] + + def load_entity(self, ref: object) -> None: + return None + + def load_vocabulary_bindings(self, ref: object) -> list[None]: + return [] + + def load_context_card(self, ref: object) -> None: + return None + + with pytest.raises(TypeError) as excinfo: + register_target_adapter( + adapter_id="incomplete", + target_model_id="bad", + supported_versions="", + 
)(IncompleteAdapter) + assert "load_obligation" in str(excinfo.value) + + +def test_required_methods_constant_matches_protocol() -> None: + expected = { + "describe", + "discover_entities", + "load_entity", + "load_obligation", + "load_vocabulary_bindings", + "load_context_card", + } + assert set(REQUIRED_METHODS) == expected + + +def test_default_iter_terms_raises_not_implemented() -> None: + class BareMixinAdapter(TargetOntologyAdapterMixin): + pass + + with pytest.raises(NotImplementedError, match="EXTERNAL"): + next(BareMixinAdapter().iter_terms(vocabulary_ref=None)) # type: ignore[arg-type] diff --git a/tests/unit/targets/test_registry.py b/tests/unit/targets/test_registry.py new file mode 100644 index 0000000..7cde274 --- /dev/null +++ b/tests/unit/targets/test_registry.py @@ -0,0 +1,174 @@ +"""Version-aware adapter registry tests.""" + +from __future__ import annotations + +import pytest + +pytestmark = pytest.mark.unit + +from sema.targets import ( + AdapterRegistryError, + AmbiguousAdapterError, + NoMatchingAdapterError, + OverlappingVersionRangeError, + UnknownAdapterError, + discover_entry_points, + get, + list_registered, + register_target_adapter, +) + + +def _make_subclass(base: type, name: str) -> type: + return type(name, (base,), {}) + + +def test_single_match_resolves(fake_adapter_cls: type) -> None: + register_target_adapter( + adapter_id="manifest", target_model_id="acris-nyc", supported_versions="" + )(fake_adapter_cls) + assert get("manifest", "acris-nyc") is fake_adapter_cls + assert get("manifest", "acris-nyc", "9.9.9") is fake_adapter_cls + + +def test_two_non_overlapping_ranges_resolve_per_range(fake_adapter_cls: type) -> None: + a = _make_subclass(fake_adapter_cls, "OmopV5_0") + b = _make_subclass(fake_adapter_cls, "OmopV5_4") + register_target_adapter( + adapter_id="omop_cdm", target_model_id="omop-cdm", supported_versions=">=5.0,<5.4" + )(a) + register_target_adapter( + adapter_id="omop_cdm", target_model_id="omop-cdm", 
supported_versions=">=5.4,<6.0" + )(b) + assert get("omop_cdm", "omop-cdm", "5.3.1") is a + assert get("omop_cdm", "omop-cdm", "5.4") is b + with pytest.raises(AmbiguousAdapterError): + get("omop_cdm", "omop-cdm") + + +def test_overlap_detected_when_ranges_share_interior_only( + fake_adapter_cls: type, +) -> None: + """Two non-boundary-aligned ranges that share an interior version + must trip overlap rejection. e.g. (>=5.0,<5.4) vs (>=5.3,<6.0) + overlap on [5.3, 5.4).""" + a = _make_subclass(fake_adapter_cls, "A") + b = _make_subclass(fake_adapter_cls, "B") + register_target_adapter( + adapter_id="omop_cdm", target_model_id="omop-cdm", supported_versions=">=5.0,<5.4" + )(a) + with pytest.raises(OverlappingVersionRangeError): + register_target_adapter( + adapter_id="omop_cdm", + target_model_id="omop-cdm", + supported_versions=">=5.3,<6.0", + )(b) + + +def test_overlap_detected_for_strict_inequality_ranges( + fake_adapter_cls: type, +) -> None: + """`>1,<3` and `>2,<4` overlap on (2, 3); both endpoints exclusive.""" + a = _make_subclass(fake_adapter_cls, "A") + b = _make_subclass(fake_adapter_cls, "B") + register_target_adapter( + adapter_id="x", target_model_id="m", supported_versions=">1,<3" + )(a) + with pytest.raises(OverlappingVersionRangeError): + register_target_adapter( + adapter_id="x", target_model_id="m", supported_versions=">2,<4" + )(b) + + +def test_overlap_with_open_lower_bound(fake_adapter_cls: type) -> None: + """`<5.0` and `>=4.0,<6.0` overlap on [4.0, 5.0).""" + a = _make_subclass(fake_adapter_cls, "A") + b = _make_subclass(fake_adapter_cls, "B") + register_target_adapter( + adapter_id="x", target_model_id="m", supported_versions="<5.0" + )(a) + with pytest.raises(OverlappingVersionRangeError): + register_target_adapter( + adapter_id="x", target_model_id="m", supported_versions=">=4.0,<6.0" + )(b) + + +def test_overlapping_ranges_rejected_at_registration(fake_adapter_cls: type) -> None: + a = _make_subclass(fake_adapter_cls, "A") + b = 
_make_subclass(fake_adapter_cls, "B") + register_target_adapter( + adapter_id="omop_cdm", target_model_id="omop-cdm", supported_versions=">=5.0,<6.0" + )(a) + with pytest.raises(OverlappingVersionRangeError): + register_target_adapter( + adapter_id="omop_cdm", + target_model_id="omop-cdm", + supported_versions=">=5.4,<7.0", + )(b) + + +def test_no_matching_version_raises(fake_adapter_cls: type) -> None: + register_target_adapter( + adapter_id="omop_cdm", target_model_id="omop-cdm", supported_versions=">=5.4,<6.0" + )(fake_adapter_cls) + with pytest.raises(NoMatchingAdapterError): + get("omop_cdm", "omop-cdm", "5.3") + + +def test_unknown_adapter_id_raises(fake_adapter_cls: type) -> None: + register_target_adapter( + adapter_id="manifest", target_model_id="acris-nyc", supported_versions="" + )(fake_adapter_cls) + with pytest.raises(UnknownAdapterError) as excinfo: + get("nonexistent", "anything") + assert "manifest" in str(excinfo.value) + + +def test_unknown_target_model_id_raises(fake_adapter_cls: type) -> None: + register_target_adapter( + adapter_id="manifest", target_model_id="acris-nyc", supported_versions="" + )(fake_adapter_cls) + with pytest.raises(UnknownAdapterError) as excinfo: + get("manifest", "no-such-model") + assert "acris-nyc" in str(excinfo.value) + + +def test_literal_star_wildcard_rejected(fake_adapter_cls: type) -> None: + with pytest.raises(AdapterRegistryError, match="empty string"): + register_target_adapter( + adapter_id="manifest", + target_model_id="any", + supported_versions="*", + )(fake_adapter_cls) + + +def test_list_returns_sorted_tuples(fake_adapter_cls: type) -> None: + a = _make_subclass(fake_adapter_cls, "A") + b = _make_subclass(fake_adapter_cls, "B") + register_target_adapter(adapter_id="z_adapter", target_model_id="z", supported_versions="")(a) + register_target_adapter(adapter_id="a_adapter", target_model_id="a", supported_versions="")(b) + rows = list_registered() + assert rows == sorted(rows) + assert ("a_adapter", "a", "") 
in rows + assert ("z_adapter", "z", "") in rows + + +def test_omitted_version_with_multiple_registrations_raises_ambiguous( + fake_adapter_cls: type, +) -> None: + a = _make_subclass(fake_adapter_cls, "A") + b = _make_subclass(fake_adapter_cls, "B") + register_target_adapter( + adapter_id="omop_cdm", target_model_id="omop-cdm", supported_versions=">=5.0,<5.4" + )(a) + register_target_adapter( + adapter_id="omop_cdm", target_model_id="omop-cdm", supported_versions=">=5.4,<6.0" + )(b) + with pytest.raises(AmbiguousAdapterError): + get("omop_cdm", "omop-cdm") + + +def test_discover_entry_points_returns_classes_without_registering() -> None: + found = discover_entry_points("sema.target_adapters") + assert isinstance(found, list) + assert list_registered() == [] diff --git a/tests/unit/targets/test_target_loader_migrations.py b/tests/unit/targets/test_target_loader_migrations.py new file mode 100644 index 0000000..9584634 --- /dev/null +++ b/tests/unit/targets/test_target_loader_migrations.py @@ -0,0 +1,138 @@ +"""Unit checks for the target-loader Cypher migration shape (5.12, 5.13).""" + +from __future__ import annotations + +import pytest + +from sema.graph.target_loader_migrations import cypher_down, cypher_up + +pytestmark = pytest.mark.unit + + +def test_up_declares_enrichment_decision_uniqueness() -> None: + stmts = cypher_up() + assert any( + "CONSTRAINT enrichment_decision_unique" in s + and "EnrichmentDecision" in s + and "target_schema_snapshot_hash" in s + and "n.entity_ref" in s + for s in stmts + ) + + +def test_up_declares_hash_versioned_entity_uniqueness() -> None: + stmts = cypher_up() + assert any( + "target_entity_hash_unique" in s + and "Entity" in s + and "target_schema_snapshot_hash" in s + and "qualified_name" in s + for s in stmts + ) + + +def test_up_declares_hash_versioned_property_uniqueness() -> None: + stmts = cypher_up() + assert any( + "target_property_hash_unique" in s + and "Property" in s + and "parent_entity_qualified_name" in s + and 
"target_schema_snapshot_hash" in s + for s in stmts + ) + + +def test_up_declares_hash_versioned_obligation_uniqueness() -> None: + stmts = cypher_up() + assert any( + "target_obligation_hash_unique" in s + and "TargetObligation" in s + and "target_schema_snapshot_hash" in s + and "target_entity" in s + for s in stmts + ) + + +def test_up_declares_indexes_on_five_facets_status() -> None: + stmts = cypher_up() + facets = ( + "structure", + "obligations", + "vocabulary_bindings", + "semantic_aliases", + "terms", + ) + for f in facets: + assert any( + f"entity_enrichment_{f}_status" in s + and f"enrichment_{f}_status" in s + and "Entity" in s + for s in stmts + ), f"missing index for facet {f}" + + +def test_up_declares_is_current_indexes() -> None: + stmts = cypher_up() + assert any("entity_is_current" in s for s in stmts) + assert any("property_is_current" in s for s in stmts) + + +def test_up_declares_property_kind_index() -> None: + stmts = cypher_up() + assert any("property_property_kind" in s for s in stmts) + + +def test_down_drops_every_constraint_and_index_added_by_up() -> None: + up = cypher_up() + down = cypher_down() + drops = "\n".join(down) + assert "DROP CONSTRAINT enrichment_decision_unique" in drops + assert "DROP CONSTRAINT target_entity_hash_unique" in drops + assert "DROP CONSTRAINT target_property_hash_unique" in drops + assert "DROP CONSTRAINT target_obligation_hash_unique" in drops + for f in ( + "structure", + "obligations", + "vocabulary_bindings", + "semantic_aliases", + "terms", + ): + assert f"DROP INDEX entity_enrichment_{f}_status" in drops + assert "DROP INDEX entity_is_current" in drops + assert "DROP INDEX property_property_kind" in drops + assert "DROP INDEX property_is_current" in drops + assert "MATCH (n:EnrichmentDecision)" in drops + assert len(up) > 0 + + +def test_up_declares_term_constraint_vocab_binding_card_uniqueness() -> None: + stmts = "\n".join(cypher_up()) + assert "target_term_hash_unique" in stmts and "Term" in stmts + 
assert "target_constraint_hash_unique" in stmts and "Constraint" in stmts + assert "target_vocab_binding_hash_unique" in stmts + assert "target_context_card_hash_unique" in stmts + + +def test_up_declares_is_current_indexes_for_all_versioned_labels() -> None: + stmts = "\n".join(cypher_up()) + for name in ( + "target_obligation_is_current", + "target_term_is_current", + "target_constraint_is_current", + "target_vocab_binding_is_current", + "target_context_card_is_current", + ): + assert name in stmts + + +def test_down_drops_new_uniqueness_constraints_and_labels() -> None: + drops = "\n".join(cypher_down()) + for c in ( + "DROP CONSTRAINT target_term_hash_unique", + "DROP CONSTRAINT target_constraint_hash_unique", + "DROP CONSTRAINT target_vocab_binding_hash_unique", + "DROP CONSTRAINT target_context_card_hash_unique", + "MATCH (n:VocabularyBinding)", + "MATCH (n:ContextCard)", + ): + assert c in drops diff --git a/tests/unit/test_cli_target.py b/tests/unit/test_cli_target.py new file mode 100644 index 0000000..0d42654 --- /dev/null +++ b/tests/unit/test_cli_target.py @@ -0,0 +1,80 @@ +"""CLI tests for `sema target load --manifest `.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest +from click.testing import CliRunner + +from sema.cli_target import target_group +from sema.targets.registry import _clear_for_tests + +pytestmark = pytest.mark.unit + +_GOLDEN = ( + Path(__file__).resolve().parents[1] + / "unit" + / "targets" + / "fixtures" + / "golden_manifest.yaml" +) + + +@pytest.fixture(autouse=True) +def _isolate_registry(): + _clear_for_tests() + yield + _clear_for_tests() + + +def test_target_load_in_memory_writer_prints_summary() -> None: + runner = CliRunner() + result = runner.invoke( + target_group, ["load", "--manifest", str(_GOLDEN), "--writer", "in-memory"] + ) + assert result.exit_code == 0, result.output + payload = json.loads(result.output) + assert payload["target_model_id"] == "golden-target" + assert 
payload["target_schema_snapshot_hash"] + assert "entities" in payload and len(payload["entities"]) > 0 + assert "context_cards" in payload and len(payload["context_cards"]) > 0 + + +def test_target_load_rejects_missing_manifest() -> None: + runner = CliRunner() + result = runner.invoke( + target_group, ["load", "--manifest", "/no/such/file.yaml", "--writer", "in-memory"] + ) + assert result.exit_code != 0 + assert "manifest" in result.output.lower() or "no such" in result.output.lower() + + +def test_target_load_default_writer_is_in_memory_when_no_neo4j_flags() -> None: + runner = CliRunner() + result = runner.invoke(target_group, ["load", "--manifest", str(_GOLDEN)]) + assert result.exit_code == 0, result.output + + +def test_target_load_skip_facets_threaded() -> None: + runner = CliRunner() + result = runner.invoke( + target_group, + [ + "load", + "--manifest", + str(_GOLDEN), + "--writer", + "in-memory", + "--skip-facet", + "semantic_aliases", + ], + ) + assert result.exit_code == 0 + payload = json.loads(result.output) + statuses = [ + d["decisions"]["semantic_aliases"]["status"] + for d in payload["enrichment_decisions"] + ] + assert any(s == "required_skipped" for s in statuses) diff --git a/uv.lock b/uv.lock index ea6a8ca..b7b0a69 100644 --- a/uv.lock +++ b/uv.lock @@ -4129,10 +4129,12 @@ dependencies = [ { name = "langgraph" }, { name = "loguru" }, { name = "neo4j" }, + { name = "packaging" }, { name = "pyarrow" }, { name = "pydantic" }, { name = "pydantic-settings" }, { name = "python-dotenv" }, + { name = "pyyaml" }, { name = "rich" }, { name = "sentence-transformers" }, { name = "sqlglot" }, @@ -4162,10 +4164,12 @@ requires-dist = [ { name = "langgraph", specifier = ">=0.2.0" }, { name = "loguru", specifier = ">=0.7.3" }, { name = "neo4j", specifier = ">=5.0.0" }, + { name = "packaging", specifier = ">=24.0" }, { name = "pyarrow", specifier = ">=14.0.0" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pydantic-settings", specifier = ">=2.0.0" 
}, { name = "python-dotenv", specifier = ">=1.0.0" }, + { name = "pyyaml", specifier = ">=6.0" }, { name = "rich", specifier = ">=13.0.0" }, { name = "sentence-transformers", specifier = ">=3.0.0" }, { name = "sqlglot", specifier = ">=25.0.0" },