From 156f641df03a433854d53f6bf8370d3749b66f03 Mon Sep 17 00:00:00 2001 From: Baruch Oxman Date: Wed, 27 May 2026 15:42:20 +0300 Subject: [PATCH 01/13] =?UTF-8?q?Add=20bidirectional=20Honeydew=20?= =?UTF-8?q?=E2=86=94=20OSI=20converter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements OSI → Honeydew and Honeydew → OSI conversion with full round-trip fidelity. Relationship names are stored natively on the relation, and relations are always placed on the many side. Co-Authored-By: Claude Sonnet 4.6 --- converters/honeydew/README.md | 68 ++ converters/honeydew/requirements.txt | 2 + .../honeydew/src/honeydew_osi_converter.py | 946 ++++++++++++++++++ .../tests/test_honeydew_osi_converter.py | 888 ++++++++++++++++ 4 files changed, 1904 insertions(+) create mode 100644 converters/honeydew/README.md create mode 100644 converters/honeydew/requirements.txt create mode 100644 converters/honeydew/src/honeydew_osi_converter.py create mode 100644 converters/honeydew/tests/test_honeydew_osi_converter.py diff --git a/converters/honeydew/README.md b/converters/honeydew/README.md new file mode 100644 index 0000000..eb0e54b --- /dev/null +++ b/converters/honeydew/README.md @@ -0,0 +1,68 @@ +# OSI ↔ Honeydew Converter + +Bidirectional converter between [OSI](../../core-spec/spec.md) semantic models and [Honeydew](https://docs.honeydew.ai) workspace YAML. 
+ +## Overview + +| Direction | Input | Output | +|-----------|-------|--------| +| `osi-to-honeydew` | Single OSI YAML file | Honeydew workspace directory | +| `honeydew-to-osi` | Honeydew workspace directory | Single OSI YAML file | + +### OSI → Honeydew mapping + +| OSI concept | Honeydew concept | +|-------------|-----------------| +| `semantic_model.name` | `workspace.yml name` | +| `dataset` | Entity + dataset files under `schema/<entity>/` | +| `dataset.source` | `dataset.sql` | +| `dataset.primary_key` | `entity.keys` | +| Simple column field | `dataset.attributes` entry | +| Computed field expression | `calculated_attribute` YAML | +| `relationship` (from → to) | `entity.relations` on the "from" entity (`rel_type: many-to-one`) | +| `metric` | `metric` YAML (assigned to an entity by the `HONEYDEW` entity hint if present, otherwise by parsing the expression) | + +### Honeydew → OSI mapping + +| Honeydew concept | OSI concept | +|-----------------|-------------| +| `workspace.name` | `semantic_model.name` | +| Entity + primary dataset | `dataset` | +| `entity.keys` | `dataset.primary_key` | +| `dataset.attributes` (columns) | `fields` with `ANSI_SQL` expression = column name | +| `calculated_attribute` SQL | `fields` with `ANSI_SQL` expression + `HONEYDEW` custom extension | +| `entity.relations` (`many-to-one`) | `relationship` with `from` = this entity | +| `entity.relations` (`one-to-many`) | `relationship` with `from` = target entity | +| `metric.sql` | `metric` expression in `ANSI_SQL` dialect | + +## Setup + +```bash +pip install -r requirements.txt +``` + +## Usage + +```bash +# OSI YAML → Honeydew workspace directory +python src/honeydew_osi_converter.py osi-to-honeydew -i input.yaml -o output_dir/ + +# Honeydew workspace directory → OSI YAML +python src/honeydew_osi_converter.py honeydew-to-osi -i workspace_dir/ -o output.yaml +``` + +## Tests + +```bash +python -m pytest tests/ +``` + +## Limitations + +- **One dataset per entity**: The converter maps each OSI dataset to a single Honeydew entity with one source 
+ dataset. Multiple datasets per entity are not generated. +- **Datatype inference**: OSI fields have no explicit datatype; the converter infers Honeydew datatypes from the `dimension.is_time` flag (`timestamp`) and the presence/absence of the `dimension` key (`string` vs `number`). +- **Honeydew SQL expressions**: Calculated attributes and metrics use Honeydew's `entity.attribute` reference syntax. These are exported as `ANSI_SQL` dialect expressions in OSI; they remain valid for round-tripping but may not run on other databases without adaptation. +- **Filters**: Honeydew `filter` objects have no OSI equivalent and are not exported. +- **Perspectives and domains**: Not converted (no OSI equivalent). +- **Connection expressions** (`connection_expr`): Preserved in `HONEYDEW` custom extensions on the OSI relationship. +- **`ai_context`**: Honeydew has no native `ai_context` equivalent. During OSI → Honeydew conversion, a structured `ai_context` object is stored in an `osi` metadata section so it can be restored on the return trip; field-level `instructions` are also appended to the Honeydew `description`, and `synonyms` are added to `labels`. Honeydew `description` fields are mapped to OSI `description`. diff --git a/converters/honeydew/requirements.txt b/converters/honeydew/requirements.txt new file mode 100644 index 0000000..2f29b5b --- /dev/null +++ b/converters/honeydew/requirements.txt @@ -0,0 +1,2 @@ +PyYAML>=5.0 +pytest>=7.0 diff --git a/converters/honeydew/src/honeydew_osi_converter.py b/converters/honeydew/src/honeydew_osi_converter.py new file mode 100644 index 0000000..0fb9e0a --- /dev/null +++ b/converters/honeydew/src/honeydew_osi_converter.py @@ -0,0 +1,946 @@ +""" +Bidirectional converter between OSI and Honeydew semantic model formats. + +OSI → Honeydew: Converts a single OSI YAML file into a Honeydew workspace + directory (multiple YAML files per entity). + +Honeydew → OSI: Reads a Honeydew workspace directory and produces an OSI YAML. 
+ +Usage: + python honeydew_osi_converter.py osi-to-honeydew -i input.yaml -o output_dir/ + python honeydew_osi_converter.py honeydew-to-osi -i workspace_dir/ -o output.yaml +""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import sys +import warnings +from typing import Any + +import yaml + +SUPPORTED_OSI_VERSION = "0.2.0.dev0" +HONEYDEW_VENDOR = "HONEYDEW" +_OSI_METADATA_SECTION = "osi" + + +class HoneydewConversionError(Exception): + """Raised when conversion between OSI and Honeydew fails.""" + + +# ───────────────────────────────────────────────────────────────────────────── +# OSI → Honeydew +# ───────────────────────────────────────────────────────────────────────────── + + +def convert_osi_to_honeydew(osi_yaml_str: str) -> dict[str, str]: + """Convert an OSI YAML string to a Honeydew workspace file tree. + + Returns a dict mapping relative file paths to their YAML content strings. + The caller writes these to disk under the desired output directory. + + Honeydew workspace structure produced:: + + workspace.yml + schema// + .yml + datasets/.yml + attributes/.yml (computed fields only) + metrics/.yml + + OSI fields with no direct Honeydew equivalent (``ai_context``, + ``unique_keys``, non-Honeydew ``custom_extensions``, relationship + ``name``) are stored in the Honeydew ``metadata`` section under a section + named ``"osi"`` so they can be recovered on the return trip. + + Args: + osi_yaml_str: OSI YAML document as a string. + + Returns: + Dict of {relative_path: yaml_content}. + + Raises: + HoneydewConversionError: On invalid or unsupported input. + """ + root = yaml.safe_load(osi_yaml_str) + if not isinstance(root, dict): + raise HoneydewConversionError("Invalid OSI YAML: expected a mapping at the root") + + version_str = str(root.get("version", "")) + if version_str != SUPPORTED_OSI_VERSION: + raise HoneydewConversionError( + f"Unsupported OSI version '{version_str}'. 
Supported: {SUPPORTED_OSI_VERSION}" + ) + + semantic_models = root.get("semantic_model") + if not isinstance(semantic_models, list) or not semantic_models: + raise HoneydewConversionError("'semantic_model' must be a non-empty list") + + if len(semantic_models) > 1: + warnings.warn( + f"OSI YAML contains {len(semantic_models)} semantic models; " + "only the first will be converted" + ) + + return _model_to_files(semantic_models[0]) + + +def _model_to_files(sm: dict[str, Any]) -> dict[str, str]: + name = sm.get("name") + if not name: + raise HoneydewConversionError("Missing 'name' in semantic model") + + files: dict[str, str] = {} + + workspace: dict[str, Any] = {"type": "workspace", "name": name} + if sm.get("description"): + workspace["description"] = sm["description"] + + # Preserve model-level ai_context and non-HONEYDEW custom_extensions + model_ai_ctx = sm.get("ai_context") + model_ext = [e for e in (sm.get("custom_extensions") or []) if e.get("vendor_name") != HONEYDEW_VENDOR] + ws_meta = _build_osi_metadata(ai_context=model_ai_ctx, custom_extensions=model_ext or None) + if ws_meta: + workspace["metadata"] = [ws_meta] + + files["workspace.yml"] = _dump(workspace) + + datasets = sm.get("datasets") or [] + metrics = sm.get("metrics") or [] + relationships = sm.get("relationships") or [] + + entity_names = [ds["name"] for ds in datasets if ds.get("name")] + + # Group OSI relationships by from-entity + rel_by_entity: dict[str, list[dict[str, Any]]] = {} + for rel in relationships: + from_ds = rel.get("from") + if from_ds: + rel_by_entity.setdefault(from_ds, []).append(rel) + + # Assign OSI metrics to entities (honours HONEYDEW entity hint for round-trips) + metric_by_entity = _assign_metrics_to_entities(metrics, entity_names) + + for ds in datasets: + entity_name = ds.get("name") + if not entity_name: + raise HoneydewConversionError("Dataset missing 'name'") + files.update( + _dataset_to_files( + ds, + rel_by_entity.get(entity_name, []), + 
metric_by_entity.get(entity_name, []), + ) + ) + + return files + + +def _dataset_to_files( + ds: dict[str, Any], + relations: list[dict[str, Any]], + metrics: list[dict[str, Any]], +) -> dict[str, str]: + entity_name = ds["name"] + base = f"schema/{entity_name}" + files: dict[str, str] = {} + + primary_key = ds.get("primary_key") or [] + unique_keys = ds.get("unique_keys") + description = ds.get("description") + ai_context = ds.get("ai_context") + fields = ds.get("fields") or [] + ds_ext = [e for e in (ds.get("custom_extensions") or []) if e.get("vendor_name") != HONEYDEW_VENDOR] + + # ── entity YAML ──────────────────────────────────────────────────────────── + entity_dict: dict[str, Any] = {"type": "entity", "name": entity_name} + if description: + entity_dict["description"] = description + if primary_key: + entity_dict["keys"] = list(primary_key) + entity_dict["key_dataset"] = entity_name + + honeydew_relations = [] + for rel in relations: + hr = _osi_relation_to_honeydew(rel) + if hr is not None: + honeydew_relations.append(hr) + entity_dict["relations"] = honeydew_relations + + # Preserve OSI fields that have no Honeydew native equivalent + entity_meta = _build_osi_metadata( + ai_context=ai_context, + unique_keys=unique_keys, + custom_extensions=ds_ext or None, + ) + if entity_meta: + entity_dict["metadata"] = [entity_meta] + + files[f"{base}/{entity_name}.yml"] = _dump(entity_dict) + + # ── classify fields into dataset attributes vs calculated attributes ──────── + dataset_attrs: list[dict[str, Any]] = [] + calc_attrs: list[dict[str, Any]] = [] + + for field in fields: + field_name = field.get("name") + if not field_name: + raise HoneydewConversionError(f"Field missing 'name' in dataset '{entity_name}'") + + expr = _pick_ansi_expression(field.get("expression"), field_name) + if expr is None: + continue + + datatype = _osi_field_to_honeydew_datatype(field) + field_desc = field.get("description") + field_label = field.get("label") + field_ai_ctx = 
field.get("ai_context") + field_ext = [e for e in (field.get("custom_extensions") or []) if e.get("vendor_name") != HONEYDEW_VENDOR] + + # Merge ai_context instructions into description; keep full object in metadata + effective_desc = field_desc + if isinstance(field_ai_ctx, str) and field_ai_ctx: + effective_desc = f"{field_desc}\n{field_ai_ctx}" if field_desc else field_ai_ctx + elif isinstance(field_ai_ctx, dict) and field_ai_ctx.get("instructions"): + instr = field_ai_ctx["instructions"] + effective_desc = f"{field_desc}\n{instr}" if field_desc else instr + + # Build labels: OSI label + ai_context synonyms + labels: list[str] = [] + if field_label: + labels.append(field_label) + if isinstance(field_ai_ctx, dict): + for syn in (field_ai_ctx.get("synonyms") or []): + if syn not in labels: + labels.append(syn) + + field_meta = _build_osi_metadata( + ai_context=field_ai_ctx if isinstance(field_ai_ctx, dict) else None, + custom_extensions=field_ext or None, + ) + + if _is_simple_identifier(expr): + attr: dict[str, Any] = {"column": expr, "name": field_name, "datatype": datatype} + if effective_desc: + attr["description"] = effective_desc + if labels: + attr["labels"] = labels + if field_meta: + attr["metadata"] = [field_meta] + dataset_attrs.append(attr) + else: + calc: dict[str, Any] = { + "type": "calculated_attribute", + "entity": entity_name, + "name": field_name, + "datatype": datatype, + "sql": expr, + } + if effective_desc: + calc["description"] = effective_desc + if labels: + calc["labels"] = labels + if field_meta: + calc["metadata"] = [field_meta] + calc_attrs.append(calc) + + # ── dataset YAML ─────────────────────────────────────────────────────────── + source_sql, dataset_type = _parse_osi_source(ds.get("source", "")) + dataset_dict: dict[str, Any] = { + "type": "dataset", + "entity": entity_name, + "name": entity_name, + "sql": source_sql, + "dataset_type": dataset_type, + "attributes": dataset_attrs, + } + if description: + dataset_dict["description"] 
= description + + files[f"{base}/datasets/{entity_name}.yml"] = _dump(dataset_dict) + + # ── calculated_attribute YAMLs ───────────────────────────────────────────── + for calc in calc_attrs: + files[f"{base}/attributes/{calc['name']}.yml"] = _dump(calc) + + # ── metric YAMLs ──────────────────────────────────────────────────────────── + for metric in metrics: + mname = metric.get("name") + if not mname: + continue + mexpr = _pick_ansi_expression(metric.get("expression"), mname) + if mexpr is None: + continue + + metric_dict: dict[str, Any] = { + "type": "metric", + "entity": entity_name, + "name": mname, + "datatype": "number", + "sql": mexpr, + } + if metric.get("description"): + metric_dict["description"] = metric["description"] + + metric_ai_ctx = metric.get("ai_context") + if isinstance(metric_ai_ctx, str) and metric_ai_ctx: + existing = metric_dict.get("description", "") + metric_dict["description"] = f"{existing}\n{metric_ai_ctx}".strip() if existing else metric_ai_ctx + + metric_ext = [e for e in (metric.get("custom_extensions") or []) if e.get("vendor_name") != HONEYDEW_VENDOR] + metric_meta = _build_osi_metadata( + ai_context=metric_ai_ctx if isinstance(metric_ai_ctx, dict) else None, + custom_extensions=metric_ext or None, + ) + if metric_meta: + metric_dict["metadata"] = [metric_meta] + + files[f"{base}/metrics/{mname}.yml"] = _dump(metric_dict) + + return files + + +def _osi_relation_to_honeydew(rel: dict[str, Any]) -> dict[str, Any] | None: + rel_name = rel.get("name", "") + to_ds = rel.get("to") + if not to_ds: + warnings.warn(f"Relationship '{rel_name}' missing 'to', skipping") + return None + + from_cols = rel.get("from_columns") or [] + to_cols = rel.get("to_columns") or [] + + if len(from_cols) != len(to_cols): + raise HoneydewConversionError( + f"Relationship '{rel_name}': from_columns and to_columns length mismatch " + f"({len(from_cols)} vs {len(to_cols)})" + ) + + honeydew_rel: dict[str, Any] = { + "target_entity": to_ds, + "rel_type": 
"many-to-one", + } + if rel.get("name"): + honeydew_rel["name"] = rel["name"] + if from_cols: + honeydew_rel["connection"] = [ + {"src_field": fc, "target_field": tc} + for fc, tc in zip(from_cols, to_cols) + ] + return honeydew_rel + + +def _pick_ansi_expression(expression: Any, field_name: str) -> str | None: + """Select the ANSI_SQL expression; fall back to first available dialect.""" + if not isinstance(expression, dict): + return None + dialects = expression.get("dialects") or [] + if not dialects: + return None + + ansi_expr = None + first_expr = None + + for d in dialects: + dialect = (d.get("dialect") or "").upper() + expr = d.get("expression") + if first_expr is None: + first_expr = expr + if dialect == "ANSI_SQL" and ansi_expr is None: + ansi_expr = expr + + if ansi_expr is not None: + return ansi_expr + + if first_expr is not None: + warnings.warn(f"'{field_name}': no ANSI_SQL dialect found; using first available") + return first_expr + + return None + + +def _osi_field_to_honeydew_datatype(field: dict[str, Any]) -> str: + dimension = field.get("dimension") + if isinstance(dimension, dict) and dimension.get("is_time"): + return "timestamp" + if dimension is not None: + return "string" + return "number" + + +def _is_simple_identifier(expr: str) -> bool: + return bool(re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", expr.strip())) + + +def _parse_osi_source(source: str) -> tuple[str, str]: + source = (source or "").strip() + if not source: + return ("", "table") + upper = source.upper() + if upper.startswith(("SELECT ", "SELECT\n", "SELECT\t", "WITH ", "WITH\n", "WITH\t")): + return (source, "sql") + return (source, "table") + + +def _assign_metrics_to_entities( + metrics: list[dict[str, Any]], + entity_names: list[str], +) -> dict[str, list[dict[str, Any]]]: + """Assign each OSI metric to the most appropriate Honeydew entity. + + Priority: + 1. HONEYDEW ``custom_extension`` entity hint (preserves round-trip placement) + 2. 
First ``entity.column`` pattern in the ANSI_SQL expression + 3. First entity in the model (with a warning) + """ + entity_set = set(entity_names) + result: dict[str, list[dict[str, Any]]] = {} + + for metric in metrics: + mname = metric.get("name", "") + + # Priority 1: HONEYDEW entity hint (set during Honeydew → OSI) + hinted = _get_honeydew_extension(metric).get("entity") + if hinted and hinted in entity_set: + result.setdefault(hinted, []).append(metric) + continue + + # Priority 2: expression scan + expr_dict = metric.get("expression") or {} + dialects = expr_dict.get("dialects") or [] if isinstance(expr_dict, dict) else [] + expr_str = "" + for d in dialects: + if (d.get("dialect") or "").upper() == "ANSI_SQL": + expr_str = d.get("expression") or "" + break + if not expr_str and dialects: + expr_str = dialects[0].get("expression") or "" + + assigned = _find_entity_in_expression(expr_str, entity_set) + + # Priority 3: fallback + if assigned is None: + if entity_names: + assigned = entity_names[0] + warnings.warn( + f"Metric '{mname}': no entity reference found in expression; " + f"assigning to '{assigned}'" + ) + else: + warnings.warn(f"Metric '{mname}': no entities to assign to, skipping") + continue + + result.setdefault(assigned, []).append(metric) + + return result + + +def _find_entity_in_expression(expr: str, entity_names: set[str]) -> str | None: + for match in re.finditer(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\.([a-zA-Z_][a-zA-Z0-9_]*)\b", expr): + if match.group(1) in entity_names: + return match.group(1) + return None + + +# ───────────────────────────────────────────────────────────────────────────── +# Honeydew → OSI +# ───────────────────────────────────────────────────────────────────────────── + + +def convert_honeydew_to_osi(workspace_dir: str) -> str: + """Convert a Honeydew workspace directory to an OSI YAML string. + + Reads workspace.yml and all entity subdirectories under schema/. 
Honeydew + fields with no OSI equivalent (``owner``, ``display_name``, ``hidden``, + ``format_string``, ``timegrain``, attribute ``labels``) are preserved in a + HONEYDEW ``custom_extension`` so they survive a round-trip back to Honeydew. + + Args: + workspace_dir: Path to the Honeydew workspace root. + + Returns: + OSI YAML document string. + + Raises: + HoneydewConversionError: On missing workspace.yml. + """ + workspace_path = os.path.join(workspace_dir, "workspace.yml") + if not os.path.exists(workspace_path): + raise HoneydewConversionError(f"workspace.yml not found in '{workspace_dir}'") + + with open(workspace_path) as f: + workspace = yaml.safe_load(f) or {} + + model_name = workspace.get("name") or os.path.basename(workspace_dir.rstrip("/\\")) + model_description = workspace.get("description") + ws_osi_meta = _read_osi_metadata(workspace) + + schema_dir = os.path.join(workspace_dir, "schema") + entity_dirs: list[str] = [] + if os.path.isdir(schema_dir): + entity_dirs = sorted( + d for d in os.listdir(schema_dir) + if os.path.isdir(os.path.join(schema_dir, d)) + ) + + osi_datasets: list[dict[str, Any]] = [] + osi_relationships: list[dict[str, Any]] = [] + osi_metrics: list[dict[str, Any]] = [] + seen_relationships: set[tuple] = set() + + for entity_name in entity_dirs: + entity_dir = os.path.join(schema_dir, entity_name) + entity_data = _read_entity_dir(entity_dir, entity_name) + + osi_datasets.append(_entity_to_osi_dataset(entity_data)) + + for rel in entity_data["relations"]: + osi_rel = _honeydew_relation_to_osi( + rel, entity_name, seen_relationships + ) + if osi_rel is not None: + osi_relationships.append(osi_rel) + + for metric in entity_data["metrics"]: + osi_m = _honeydew_metric_to_osi(metric, entity_name) + if osi_m is not None: + osi_metrics.append(osi_m) + + sm: dict[str, Any] = {"name": model_name, "datasets": osi_datasets} + if model_description: + sm["description"] = str(model_description).strip() + if ws_osi_meta.get("ai_context"): + 
sm["ai_context"] = ws_osi_meta["ai_context"] + + restored_ws_ext = ws_osi_meta.get("custom_extensions") or [] + if restored_ws_ext: + sm["custom_extensions"] = restored_ws_ext + + if osi_relationships: + sm["relationships"] = osi_relationships + if osi_metrics: + sm["metrics"] = osi_metrics + + root: dict[str, Any] = { + "version": SUPPORTED_OSI_VERSION, + "vendors": [HONEYDEW_VENDOR], + "semantic_model": [sm], + } + return _dump(root) + + +def _read_entity_dir(entity_dir: str, entity_name: str) -> dict[str, Any]: + data: dict[str, Any] = { + "name": entity_name, + "description": None, + "keys": [], + "key_dataset": None, + "relations": [], + "primary_dataset": None, + "calculated_attributes": [], + "metrics": [], + "osi_meta": {}, + "honeydew_extra": {}, + } + + entity_yml = os.path.join(entity_dir, f"{entity_name}.yml") + if os.path.exists(entity_yml): + with open(entity_yml) as f: + ey = yaml.safe_load(f) or {} + data["keys"] = _coerce_list(ey.get("keys")) + data["description"] = ey.get("description") + data["key_dataset"] = ey.get("key_dataset") + data["relations"] = ey.get("relations") or [] + data["osi_meta"] = _read_osi_metadata(ey) + data["honeydew_extra"] = { + k: ey[k] for k in ("owner", "display_name", "hidden", "folder", "labels") + if k in ey + } + + datasets_dir = os.path.join(entity_dir, "datasets") + if os.path.isdir(datasets_dir): + all_ds: list[dict[str, Any]] = [] + for fn in sorted(os.listdir(datasets_dir)): + if fn.endswith((".yml", ".yaml")): + with open(os.path.join(datasets_dir, fn)) as f: + all_ds.append(yaml.safe_load(f) or {}) + for ds in all_ds: + if ds.get("name") == data["key_dataset"] or data["primary_dataset"] is None: + data["primary_dataset"] = ds + if ds.get("name") == data["key_dataset"]: + break + + attrs_dir = os.path.join(entity_dir, "attributes") + if os.path.isdir(attrs_dir): + for fn in sorted(os.listdir(attrs_dir)): + if fn.endswith((".yml", ".yaml")): + with open(os.path.join(attrs_dir, fn)) as f: + 
data["calculated_attributes"].append(yaml.safe_load(f) or {}) + + metrics_dir = os.path.join(entity_dir, "metrics") + if os.path.isdir(metrics_dir): + for fn in sorted(os.listdir(metrics_dir)): + if fn.endswith((".yml", ".yaml")): + with open(os.path.join(metrics_dir, fn)) as f: + data["metrics"].append(yaml.safe_load(f) or {}) + + return data + + +def _entity_to_osi_dataset(entity_data: dict[str, Any]) -> dict[str, Any]: + entity_name = entity_data["name"] + ds: dict[str, Any] = {"name": entity_name} + + if entity_data.get("description"): + ds["description"] = str(entity_data["description"]).strip() + + primary_ds = entity_data.get("primary_dataset") + ds["source"] = (primary_ds.get("sql") or "").strip() if primary_ds else entity_name + + keys = entity_data.get("keys") or [] + if keys: + ds["primary_key"] = list(keys) + + # Restore OSI-only fields preserved in Honeydew metadata + osi_meta = entity_data.get("osi_meta") or {} + if osi_meta.get("ai_context"): + ds["ai_context"] = osi_meta["ai_context"] + if osi_meta.get("unique_keys"): + ds["unique_keys"] = osi_meta["unique_keys"] + + restored_ext = list(osi_meta.get("custom_extensions") or []) + honeydew_extra = entity_data.get("honeydew_extra") or {} + if honeydew_extra: + restored_ext.append({"vendor_name": HONEYDEW_VENDOR, "data": json.dumps(honeydew_extra)}) + if restored_ext: + ds["custom_extensions"] = restored_ext + + # Build fields + fields: list[dict[str, Any]] = [] + seen: set[str] = set() + + if primary_ds: + for attr in primary_ds.get("attributes") or []: + col = attr.get("column") or attr.get("name") or "" + aname = attr.get("name") or col + if not aname or aname in seen: + continue + seen.add(aname) + + field: dict[str, Any] = { + "name": aname, + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": col}]}, + } + datatype = attr.get("datatype") or "string" + dim = _honeydew_datatype_to_osi_dimension(datatype) + if dim is not None: + field["dimension"] = dim + if attr.get("description"): + 
field["description"] = str(attr["description"]).strip() + + attr_osi_meta = _read_osi_metadata(attr) + attr_labels = attr.get("labels") or [] + + # Restore ai_context (structured form takes priority, else build from labels) + if attr_osi_meta.get("ai_context"): + field["ai_context"] = attr_osi_meta["ai_context"] + elif attr_labels: + field["ai_context"] = {"synonyms": list(attr_labels)} + + # Restore label (first Honeydew label maps to OSI label) + if attr_labels: + field["label"] = attr_labels[0] + + # Honeydew-specific metadata → HONEYDEW custom_extension + attr_honeydew_extra = { + k: attr[k] for k in ("display_name", "hidden", "folder", "format_string", "timegrain") + if k in attr + } + if len(attr_labels) > 1: + attr_honeydew_extra["labels"] = attr_labels + + all_ext = list(attr_osi_meta.get("custom_extensions") or []) + if attr_honeydew_extra: + all_ext.append({"vendor_name": HONEYDEW_VENDOR, "data": json.dumps(attr_honeydew_extra)}) + if all_ext: + field["custom_extensions"] = all_ext + + fields.append(field) + + for calc in entity_data.get("calculated_attributes") or []: + aname = calc.get("name") or "" + if not aname or aname in seen: + continue + seen.add(aname) + + sql = (calc.get("sql") or "").strip() + datatype = calc.get("datatype") or "string" + + field = { + "name": aname, + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": sql}]}, + } + dim = _honeydew_datatype_to_osi_dimension(datatype) + if dim is not None: + field["dimension"] = dim + if calc.get("description"): + cleaned = str(calc["description"]).strip() + if cleaned: + field["description"] = cleaned + + calc_osi_meta = _read_osi_metadata(calc) + calc_labels = calc.get("labels") or [] + + if calc_osi_meta.get("ai_context"): + field["ai_context"] = calc_osi_meta["ai_context"] + elif calc_labels: + field["ai_context"] = {"synonyms": list(calc_labels)} + + if calc_labels: + field["label"] = calc_labels[0] + + calc_honeydew_extra = { + k: calc[k] for k in ("display_name", "hidden", 
"folder", "format_string", "timegrain") + if k in calc + } + + all_calc_ext = list(calc_osi_meta.get("custom_extensions") or []) + # Always mark as calculated_attribute so OSI → Honeydew routes it correctly + all_calc_ext.append({ + "vendor_name": HONEYDEW_VENDOR, + "data": json.dumps(dict({"type": "calculated_attribute", "entity": entity_name}, **calc_honeydew_extra)), + }) + field["custom_extensions"] = all_calc_ext + + fields.append(field) + + if fields: + ds["fields"] = fields + + return ds + + +def _honeydew_datatype_to_osi_dimension(datatype: str) -> dict[str, Any] | None: + dt = (datatype or "").lower() + if dt in ("date", "timestamp", "time"): + return {"is_time": True} + if dt in ("bool", "string"): + return {"is_time": False} + return None # number / float → OSI fact (no dimension key) + + +def _honeydew_relation_to_osi( + rel: dict[str, Any], + entity_name: str, + seen: set[tuple], +) -> dict[str, Any] | None: + target = rel.get("target_entity") + if not target: + warnings.warn(f"Entity '{entity_name}': relation missing target_entity, skipping") + return None + + rel_type = (rel.get("rel_type") or "many-to-one").lower() + connection = rel.get("connection") or [] + connection_expr = rel.get("connection_expr") + + if rel_type == "many-to-one": + from_entity, to_entity = entity_name, target + from_cols = [c.get("src_field", "") for c in connection] + to_cols = [c.get("target_field", "") for c in connection] + elif rel_type == "one-to-many": + from_entity, to_entity = target, entity_name + from_cols = [c.get("target_field", "") for c in connection] + to_cols = [c.get("src_field", "") for c in connection] + else: + from_entity, to_entity = entity_name, target + from_cols = [c.get("src_field", "") for c in connection] + to_cols = [c.get("target_field", "") for c in connection] + + dedup_key = (from_entity, to_entity, tuple(from_cols), tuple(to_cols)) + if dedup_key in seen: + return None + seen.add(dedup_key) + + rel_name = rel.get("name") or 
def _honeydew_metric_to_osi(metric: dict[str, Any], entity_name: str) -> dict[str, Any] | None:
    """Convert one Honeydew metric definition into an OSI metric dict.

    Args:
        metric: Parsed Honeydew metric YAML (reads ``name``, ``sql``,
            ``description`` and the ``metadata`` sections).
        entity_name: Name of the entity that owns the metric; recorded in a
            HONEYDEW custom extension on the result.

    Returns:
        The OSI metric dict, or ``None`` when the metric has no name or no SQL
        (the missing-SQL case also emits a warning).
    """
    mname = metric.get("name") or ""
    if not mname:
        return None

    # str() keeps a non-string scalar (e.g. a YAML number) from crashing .strip().
    sql = str(metric.get("sql") or "").strip()
    if not sql:
        warnings.warn(f"Metric '{mname}' in entity '{entity_name}' has no SQL, skipping")
        return None

    osi_m: dict[str, Any] = {
        "name": mname,
        "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": sql}]},
        "custom_extensions": [
            {"vendor_name": HONEYDEW_VENDOR, "data": json.dumps({"entity": entity_name})}
        ],
    }

    if metric.get("description"):
        cleaned = str(metric["description"]).strip()
        if cleaned:
            osi_m["description"] = cleaned

    metric_osi_meta = _read_osi_metadata(metric)
    if metric_osi_meta.get("ai_context"):
        osi_m["ai_context"] = metric_osi_meta["ai_context"]

    # Only a list can be appended meaningfully; list(<dict>) would add bare keys.
    restored_ext = metric_osi_meta.get("custom_extensions") or []
    if isinstance(restored_ext, list) and restored_ext:
        osi_m["custom_extensions"] = osi_m["custom_extensions"] + list(restored_ext)

    return osi_m


# ─────────────────────────────────────────────────────────────────────────────
# OSI metadata helpers — store/restore OSI fields in Honeydew metadata sections
# ─────────────────────────────────────────────────────────────────────────────


def _build_osi_metadata(
    *,
    ai_context: Any = None,
    unique_keys: Any = None,
    custom_extensions: list | None = None,
) -> dict[str, Any] | None:
    """Build a Honeydew metadata entry that stores OSI-only fields for round-tripping.

    String ``ai_context`` values are stored verbatim; any other value is stored
    as JSON text. ``unique_keys`` and ``custom_extensions`` are always stored as
    JSON text and are omitted when empty.

    Returns:
        A ``{"name": <osi section>, "metadata": [...]}`` dict, or ``None`` when
        there is nothing to store.
    """
    items: list[dict[str, Any]] = []

    if ai_context is not None:
        val = ai_context if isinstance(ai_context, str) else json.dumps(ai_context)
        items.append({"name": "ai_context", "value": val})
    if unique_keys:
        items.append({"name": "unique_keys", "value": json.dumps(unique_keys)})
    if custom_extensions:
        items.append({"name": "custom_extensions", "value": json.dumps(custom_extensions)})

    if not items:
        return None
    return {"name": _OSI_METADATA_SECTION, "metadata": items}


def _decode_osi_ai_context(raw: Any) -> Any:
    """Undo the ``ai_context`` encoding applied by ``_build_osi_metadata``.

    Only JSON containers (object/array) are decoded. A plain string that merely
    happens to be valid JSON ("2024", "true", "null") is returned unchanged, so
    a string context is not silently turned into an int/bool/None.
    """
    if not isinstance(raw, str):
        return raw
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        return raw
    return decoded if isinstance(decoded, (dict, list)) else raw


def _read_osi_metadata(obj: dict[str, Any]) -> dict[str, Any]:
    """Read OSI-preserved fields from a Honeydew object's 'osi' metadata section.

    Only the first section whose name matches the OSI section name is read.
    Entries that are not mappings, and ``unique_keys`` / ``custom_extensions``
    values that do not decode to a list, are ignored rather than raising.

    Returns:
        A dict with any of the keys ``ai_context``, ``unique_keys`` and
        ``custom_extensions``; empty when no OSI section exists.
    """
    for section in (obj.get("metadata") or []):
        if not isinstance(section, dict):
            continue
        if (section.get("name") or "") != _OSI_METADATA_SECTION:
            continue
        result: dict[str, Any] = {}
        for item in (section.get("metadata") or []):
            if not isinstance(item, dict):
                continue
            key = item.get("name") or ""
            raw = item.get("value")
            if key == "ai_context":
                result[key] = _decode_osi_ai_context(raw)
            elif key in ("unique_keys", "custom_extensions"):
                if isinstance(raw, list):
                    # Already a native list (not JSON text): use it as-is.
                    result[key] = raw
                    continue
                try:
                    decoded = json.loads(raw)
                except (json.JSONDecodeError, TypeError):
                    continue
                if isinstance(decoded, list):
                    result[key] = decoded
        return result
    return {}


def _get_honeydew_extension(obj: dict[str, Any]) -> dict[str, Any]:
    """Extract the HONEYDEW custom_extension data from an OSI object.

    The first extension whose ``vendor_name`` is the Honeydew vendor decides
    the result. Its ``data`` may be JSON text or an already-parsed mapping.
    Invalid JSON, or JSON that is not an object, yields an empty dict so that
    callers can always treat the result as a mapping.
    """
    for ext in (obj.get("custom_extensions") or []):
        if not isinstance(ext, dict) or ext.get("vendor_name") != HONEYDEW_VENDOR:
            continue
        raw = ext.get("data")
        if isinstance(raw, dict):
            return raw
        try:
            data = json.loads(raw or "{}")
        except (json.JSONDecodeError, TypeError):
            return {}
        return data if isinstance(data, dict) else {}
    return {}


# ─────────────────────────────────────────────────────────────────────────────
# Utilities
# ─────────────────────────────────────────────────────────────────────────────


def _coerce_list(value: Any) -> list:
    """Return ``value`` as a list: ``None`` -> ``[]``, a list unchanged, anything else wrapped."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _dump(data: Any) -> str:
    """Serialize ``data`` to block-style YAML, keeping key order and raw Unicode."""
    return yaml.dump(data, default_flow_style=False, sort_keys=False, allow_unicode=True)


# ─────────────────────────────────────────────────────────────────────────────
# CLI
# ─────────────────────────────────────────────────────────────────────────────


def main() -> None:
    """Command-line entry point with two sub-commands.

    ``osi-to-honeydew`` reads an OSI YAML file and writes each converted file
    under the output directory; ``honeydew-to-osi`` converts a workspace
    directory and writes one OSI YAML file. Conversion errors and file-system
    errors are reported on stderr and exit with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Bidirectional OSI ↔ Honeydew semantic model converter"
    )
    sub = parser.add_subparsers(dest="command", required=True)

    p1 = sub.add_parser("osi-to-honeydew", help="Convert OSI YAML → Honeydew workspace")
    p1.add_argument("-i", "--input", required=True, help="OSI YAML input file")
    p1.add_argument("-o", "--output", required=True, help="Output directory for Honeydew workspace")

    p2 = sub.add_parser("honeydew-to-osi", help="Convert Honeydew workspace → OSI YAML")
    p2.add_argument("-i", "--input", required=True, help="Honeydew workspace directory")
    p2.add_argument("-o", "--output", required=True, help="OSI YAML output file")

    args = parser.parse_args()

    try:
        if args.command == "osi-to-honeydew":
            # _dump() emits raw Unicode, so read and write explicitly as UTF-8
            # instead of relying on the platform's locale encoding.
            with open(args.input, encoding="utf-8") as f:
                osi_yaml = f.read()
            files = convert_osi_to_honeydew(osi_yaml)

            for rel_path, content in files.items():
                full_path = os.path.join(args.output, rel_path)
                parent = os.path.dirname(full_path)
                if parent:  # os.makedirs("") raises FileNotFoundError
                    os.makedirs(parent, exist_ok=True)
                with open(full_path, "w", encoding="utf-8") as f:
                    f.write(content)
            print(f"Wrote {len(files)} file(s) to {args.output}")

        elif args.command == "honeydew-to-osi":
            osi_yaml = convert_honeydew_to_osi(args.input)
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(osi_yaml)
            print(f"Converted {args.input} → {args.output}")
    except (HoneydewConversionError, OSError) as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
b/converters/honeydew/tests/test_honeydew_osi_converter.py @@ -0,0 +1,888 @@ +"""Tests for the bidirectional OSI ↔ Honeydew converter.""" + +from __future__ import annotations + +import json +import os +import sys +import warnings +from pathlib import Path + +import pytest +import yaml + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src")) +from honeydew_osi_converter import ( + HoneydewConversionError, + _assign_metrics_to_entities, + _build_osi_metadata, + _find_entity_in_expression, + _honeydew_datatype_to_osi_dimension, + _is_simple_identifier, + _osi_field_to_honeydew_datatype, + _parse_osi_source, + _pick_ansi_expression, + _read_osi_metadata, + convert_honeydew_to_osi, + convert_osi_to_honeydew, +) + +# ───────────────────────────────────────────────────────────────────────────── +# Helpers +# ───────────────────────────────────────────────────────────────────────────── + +OSI_VERSION = "0.2.0.dev0" + + +def _osi(model_dict): + return yaml.dump( + {"version": OSI_VERSION, "semantic_model": [model_dict]}, + default_flow_style=False, + sort_keys=False, + ) + + +def _minimal_osi_field(name, expr, is_dimension=True, is_time=False): + field = { + "name": name, + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": expr}]}, + } + if is_dimension: + field["dimension"] = {"is_time": is_time} + return field + + +def _minimal_model(): + return { + "name": "test_model", + "datasets": [ + { + "name": "orders", + "source": "db.schema.orders", + "primary_key": ["order_id"], + "fields": [ + _minimal_osi_field("order_id", "order_id"), + _minimal_osi_field("order_date", "order_date", is_time=True), + _minimal_osi_field("total", "total_amount", is_dimension=False), + ], + } + ], + } + + +def _write_workspace(tmp_dir, workspace_name, entities): + """Write a minimal Honeydew workspace to tmp_dir.""" + workspace_path = os.path.join(tmp_dir, "workspace.yml") + with open(workspace_path, "w") as f: + yaml.dump({"type": "workspace", "name": 
workspace_name}, f) + + for e in entities: + ename = e["name"] + base = os.path.join(tmp_dir, "schema", ename) + os.makedirs(os.path.join(base, "datasets"), exist_ok=True) + os.makedirs(os.path.join(base, "attributes"), exist_ok=True) + os.makedirs(os.path.join(base, "metrics"), exist_ok=True) + + entity_dict = { + "type": "entity", + "name": ename, + "keys": e.get("keys", []), + "key_dataset": e.get("key_dataset", ename), + "relations": e.get("relations", []), + } + with open(os.path.join(base, f"{ename}.yml"), "w") as f: + yaml.dump(entity_dict, f) + + ds_name = e.get("key_dataset", ename) + ds_dict = { + "type": "dataset", + "entity": ename, + "name": ds_name, + "sql": e.get("sql", "DB.SCHEMA." + ename.upper()), + "dataset_type": "table", + "attributes": e.get("dataset_attrs", []), + } + with open(os.path.join(base, "datasets", f"{ds_name}.yml"), "w") as f: + yaml.dump(ds_dict, f) + + for attr in e.get("calc_attrs", []): + with open(os.path.join(base, "attributes", f"{attr['name']}.yml"), "w") as f: + yaml.dump(attr, f) + + for m in e.get("metrics", []): + with open(os.path.join(base, "metrics", f"{m['name']}.yml"), "w") as f: + yaml.dump(m, f) + + +# ───────────────────────────────────────────────────────────────────────────── +# Unit tests – helpers +# ───────────────────────────────────────────────────────────────────────────── + +class TestIsSimpleIdentifier: + def test_plain_name(self): + assert _is_simple_identifier("order_id") is True + + def test_with_spaces(self): + assert _is_simple_identifier("SUM(x)") is False + + def test_with_dot(self): + assert _is_simple_identifier("orders.id") is False + + def test_leading_number(self): + assert _is_simple_identifier("1col") is False + + def test_underscore_prefix(self): + assert _is_simple_identifier("_hidden") is True + + +class TestParseOsiSource: + def test_table_reference(self): + sql, dtype = _parse_osi_source("db.schema.table") + assert sql == "db.schema.table" and dtype == "table" + + def 
test_select_query(self): + _, dtype = _parse_osi_source("SELECT id FROM foo") + assert dtype == "sql" + + def test_with_query(self): + _, dtype = _parse_osi_source("WITH cte AS (SELECT 1) SELECT * FROM cte") + assert dtype == "sql" + + def test_empty(self): + sql, dtype = _parse_osi_source("") + assert sql == "" and dtype == "table" + + +class TestPickAnsiExpression: + def test_ansi_preferred(self): + expr = {"dialects": [ + {"dialect": "SNOWFLAKE", "expression": "col::VARCHAR"}, + {"dialect": "ANSI_SQL", "expression": "col"}, + ]} + assert _pick_ansi_expression(expr, "f") == "col" + + def test_fallback_to_first(self): + expr = {"dialects": [{"dialect": "SNOWFLAKE", "expression": "col::VARCHAR"}]} + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = _pick_ansi_expression(expr, "f") + assert result == "col::VARCHAR" + assert any("ANSI_SQL" in str(x.message) for x in w) + + def test_none_on_missing(self): + assert _pick_ansi_expression(None, "f") is None + assert _pick_ansi_expression({"dialects": []}, "f") is None + + +class TestOsiFieldDatatypes: + def test_time_dimension(self): + assert _osi_field_to_honeydew_datatype({"dimension": {"is_time": True}}) == "timestamp" + + def test_dimension(self): + assert _osi_field_to_honeydew_datatype({"dimension": {"is_time": False}}) == "string" + + def test_fact(self): + assert _osi_field_to_honeydew_datatype({}) == "number" + + +class TestHoneydewDatatypeToOsiDimension: + def test_date(self): + assert _honeydew_datatype_to_osi_dimension("date") == {"is_time": True} + + def test_timestamp(self): + assert _honeydew_datatype_to_osi_dimension("timestamp") == {"is_time": True} + + def test_string(self): + assert _honeydew_datatype_to_osi_dimension("string") == {"is_time": False} + + def test_bool(self): + assert _honeydew_datatype_to_osi_dimension("bool") == {"is_time": False} + + def test_number(self): + assert _honeydew_datatype_to_osi_dimension("number") is None + + def 
test_float(self): + assert _honeydew_datatype_to_osi_dimension("float") is None + + +class TestFindEntityInExpression: + def test_finds_entity(self): + assert _find_entity_in_expression("SUM(orders.total)", {"orders", "customers"}) == "orders" + + def test_returns_first_match(self): + result = _find_entity_in_expression("orders.a / customers.b", {"orders", "customers"}) + assert result == "orders" + + def test_no_match(self): + assert _find_entity_in_expression("COUNT(*)", {"orders"}) is None + + def test_ignores_non_entity_prefixes(self): + assert _find_entity_in_expression("SUM(foo.col)", {"orders"}) is None + + +class TestOsiMetadataHelpers: + def test_build_and_read_ai_context_string(self): + section = _build_osi_metadata(ai_context="orders, purchases") + obj = {"metadata": [section]} + result = _read_osi_metadata(obj) + assert result["ai_context"] == "orders, purchases" + + def test_build_and_read_ai_context_dict(self): + ctx = {"instructions": "Use for sales", "synonyms": ["orders", "purchases"]} + section = _build_osi_metadata(ai_context=ctx) + obj = {"metadata": [section]} + result = _read_osi_metadata(obj) + assert result["ai_context"] == ctx + + def test_build_and_read_unique_keys(self): + uks = [["col1", "col2"], ["col3"]] + section = _build_osi_metadata(unique_keys=uks) + obj = {"metadata": [section]} + result = _read_osi_metadata(obj) + assert result["unique_keys"] == uks + + def test_build_and_read_custom_extensions(self): + exts = [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}] + section = _build_osi_metadata(custom_extensions=exts) + obj = {"metadata": [section]} + result = _read_osi_metadata(obj) + assert result["custom_extensions"] == exts + + def test_returns_empty_when_no_osi_section(self): + obj = {"metadata": [{"name": "other", "metadata": []}]} + assert _read_osi_metadata(obj) == {} + + def test_returns_empty_when_no_metadata(self): + assert _read_osi_metadata({}) == {} + + def 
test_build_returns_none_when_nothing_to_store(self): + assert _build_osi_metadata() is None + + +class TestAssignMetricsToEntities: + def test_assigns_by_expression(self): + metrics = [{"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}] + result = _assign_metrics_to_entities(metrics, ["orders", "customers"]) + assert "total" in [m["name"] for m in result.get("orders", [])] + + def test_honeydew_hint_takes_priority(self): + metrics = [{ + "name": "cnt", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.x)"}]}, + "custom_extensions": [{"vendor_name": "HONEYDEW", "data": '{"entity": "customers"}'}], + }] + result = _assign_metrics_to_entities(metrics, ["orders", "customers"]) + # hint says customers even though expression references orders + assert "cnt" in [m["name"] for m in result.get("customers", [])] + assert "orders" not in result + + def test_falls_back_to_first_entity(self): + metrics = [{"name": "cnt", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "COUNT(*)"}]}}] + with warnings.catch_warnings(record=True): + result = _assign_metrics_to_entities(metrics, ["orders"]) + assert "cnt" in [m["name"] for m in result.get("orders", [])] + + def test_no_entities(self): + metrics = [{"name": "m", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "COUNT(*)"}]}}] + with warnings.catch_warnings(record=True): + result = _assign_metrics_to_entities(metrics, []) + assert result == {} + + +# ───────────────────────────────────────────────────────────────────────────── +# OSI → Honeydew integration tests +# ───────────────────────────────────────────────────────────────────────────── + +class TestOsiToHoneydew: + def test_workspace_yml_created(self): + files = convert_osi_to_honeydew(_osi(_minimal_model())) + ws = yaml.safe_load(files["workspace.yml"]) + assert ws["name"] == "test_model" and ws["type"] == "workspace" + + def test_entity_yml_created(self): 
+ files = convert_osi_to_honeydew(_osi(_minimal_model())) + entity = yaml.safe_load(files["schema/orders/orders.yml"]) + assert entity["name"] == "orders" + assert entity["keys"] == ["order_id"] + assert entity["key_dataset"] == "orders" + + def test_dataset_yml_created(self): + files = convert_osi_to_honeydew(_osi(_minimal_model())) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + assert ds["sql"] == "db.schema.orders" + assert ds["dataset_type"] == "table" + + def test_simple_fields_become_dataset_attributes(self): + files = convert_osi_to_honeydew(_osi(_minimal_model())) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + names = [a["name"] for a in ds["attributes"]] + assert "order_id" in names and "order_date" in names and "total" in names + + def test_time_field_gets_timestamp_datatype(self): + files = convert_osi_to_honeydew(_osi(_minimal_model())) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + attrs = {a["name"]: a for a in ds["attributes"]} + assert attrs["order_date"]["datatype"] == "timestamp" + + def test_fact_field_gets_number_datatype(self): + files = convert_osi_to_honeydew(_osi(_minimal_model())) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + attrs = {a["name"]: a for a in ds["attributes"]} + assert attrs["total"]["datatype"] == "number" + + def test_complex_expression_becomes_calculated_attribute(self): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "disc_price", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "price * (1 - discount)"}]}, + "dimension": {"is_time": False}, + }]}]} + files = convert_osi_to_honeydew(_osi(model)) + assert "schema/orders/attributes/disc_price.yml" in files + calc = yaml.safe_load(files["schema/orders/attributes/disc_price.yml"]) + assert calc["type"] == "calculated_attribute" + assert calc["sql"] == "price * (1 - discount)" + + def 
test_label_mapped_to_honeydew_labels(self): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "status", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, + "dimension": {"is_time": False}, + "label": "sales", + }]}]} + files = convert_osi_to_honeydew(_osi(model)) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + attrs = {a["name"]: a for a in ds["attributes"]} + assert "sales" in attrs["status"]["labels"] + + def test_ai_context_string_merged_into_description(self): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "total", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, + "description": "Base desc", + "ai_context": "revenue, earnings", + }]}]} + files = convert_osi_to_honeydew(_osi(model)) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + attrs = {a["name"]: a for a in ds["attributes"]} + assert "revenue, earnings" in attrs["total"]["description"] + + def test_ai_context_dict_instructions_merged_into_description(self): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "total", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, + "ai_context": {"instructions": "Use for revenue", "synonyms": ["rev", "earnings"]}, + }]}]} + files = convert_osi_to_honeydew(_osi(model)) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + attrs = {a["name"]: a for a in ds["attributes"]} + assert "Use for revenue" in attrs["total"]["description"] + assert "rev" in attrs["total"]["labels"] + + def test_ai_context_dict_stored_in_metadata(self): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "total", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, + "ai_context": {"instructions": "Use for revenue", 
"synonyms": ["rev"]}, + }]}]} + files = convert_osi_to_honeydew(_osi(model)) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + attr = next(a for a in ds["attributes"] if a["name"] == "total") + # Should be in the osi metadata section + osi_section = next((s for s in attr.get("metadata", []) if s["name"] == "osi"), None) + assert osi_section is not None + ai_item = next((i for i in osi_section["metadata"] if i["name"] == "ai_context"), None) + assert ai_item is not None + + def test_unique_keys_stored_in_entity_metadata(self): + model = {"name": "m", "datasets": [{"name": "items", "source": "db.s.items", + "primary_key": ["item_id"], + "unique_keys": [["sku"], ["item_id", "variant"]], + "fields": []}]} + files = convert_osi_to_honeydew(_osi(model)) + entity = yaml.safe_load(files["schema/items/items.yml"]) + osi_section = next((s for s in entity.get("metadata", []) if s["name"] == "osi"), None) + assert osi_section is not None + uk_item = next((i for i in osi_section["metadata"] if i["name"] == "unique_keys"), None) + assert uk_item is not None + assert json.loads(uk_item["value"]) == [["sku"], ["item_id", "variant"]] + + def test_non_honeydew_custom_extensions_stored_in_metadata(self): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "custom_extensions": [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}], + "fields": []}]} + files = convert_osi_to_honeydew(_osi(model)) + entity = yaml.safe_load(files["schema/orders/orders.yml"]) + osi_section = next((s for s in entity.get("metadata", []) if s["name"] == "osi"), None) + assert osi_section is not None + ext_item = next((i for i in osi_section["metadata"] if i["name"] == "custom_extensions"), None) + assert ext_item is not None + exts = json.loads(ext_item["value"]) + assert any(e["vendor_name"] == "SNOWFLAKE" for e in exts) + + def test_relationship_name_stored_in_relation(self): + model = {"name": "m", "datasets": [ + {"name": "orders", "source": 
"db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}]} + files = convert_osi_to_honeydew(_osi(model)) + entity = yaml.safe_load(files["schema/orders/orders.yml"]) + assert entity["relations"][0]["name"] == "orders_to_customers" + + def test_model_ai_context_stored_in_workspace_metadata(self): + model = {"name": "m", "datasets": [], + "ai_context": {"instructions": "Use for retail analytics", "synonyms": ["store"]}} + files = convert_osi_to_honeydew(_osi(model)) + ws = yaml.safe_load(files["workspace.yml"]) + osi_section = next((s for s in ws.get("metadata", []) if s["name"] == "osi"), None) + assert osi_section is not None + + def test_relationship_added_to_entity(self): + model = {"name": "m", "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], "relationships": [{"name": "r", "from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}]} + files = convert_osi_to_honeydew(_osi(model)) + entity = yaml.safe_load(files["schema/orders/orders.yml"]) + assert len(entity["relations"]) == 1 + rel = entity["relations"][0] + assert rel["target_entity"] == "customers" + assert rel["rel_type"] == "many-to-one" + assert rel["connection"] == [{"src_field": "cid", "target_field": "id"}] + + def test_to_entity_has_no_relation(self): + model = {"name": "m", "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], "relationships": [{"name": "r", "from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}]} + files = convert_osi_to_honeydew(_osi(model)) + entity = yaml.safe_load(files["schema/customers/customers.yml"]) + assert entity["relations"] == [] + + def 
test_metric_assigned_by_expression_entity(self): + model = {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [{"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}]} + files = convert_osi_to_honeydew(_osi(model)) + assert "schema/orders/metrics/total.yml" in files + + def test_metric_entity_hint_overrides_expression(self): + model = {"name": "m", + "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], + "metrics": [{ + "name": "cnt", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.x)"}]}, + "custom_extensions": [{"vendor_name": "HONEYDEW", "data": '{"entity": "customers"}'}], + }]} + files = convert_osi_to_honeydew(_osi(model)) + assert "schema/customers/metrics/cnt.yml" in files + assert "schema/orders/metrics/cnt.yml" not in files + + def test_invalid_version_raises(self): + with pytest.raises(HoneydewConversionError, match="Unsupported"): + convert_osi_to_honeydew("version: '9.9.9'\nsemantic_model:\n - name: m\n") + + def test_missing_semantic_model_raises(self): + with pytest.raises(HoneydewConversionError): + convert_osi_to_honeydew(f"version: '{OSI_VERSION}'\n") + + def test_subquery_source_uses_sql_type(self): + model = {"name": "m", "datasets": [{"name": "orders", + "source": "SELECT * FROM raw.orders WHERE active = true", "fields": []}]} + files = convert_osi_to_honeydew(_osi(model)) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + assert ds["dataset_type"] == "sql" + + def test_composite_primary_key(self): + model = {"name": "m", "datasets": [{"name": "li", "source": "db.s.li", + "primary_key": ["order_id", "line_number"], "fields": []}]} + files = convert_osi_to_honeydew(_osi(model)) + entity = yaml.safe_load(files["schema/li/li.yml"]) + assert entity["keys"] == ["order_id", "line_number"] + + def 
test_multiple_semantic_models_warns(self): + doc = yaml.dump({"version": OSI_VERSION, "semantic_model": [ + {"name": "m1", "datasets": []}, + {"name": "m2", "datasets": []}, + ]}) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + files = convert_osi_to_honeydew(doc) + assert any("only the first" in str(x.message) for x in w) + assert yaml.safe_load(files["workspace.yml"])["name"] == "m1" + + +# ───────────────────────────────────────────────────────────────────────────── +# Honeydew → OSI integration tests +# ───────────────────────────────────────────────────────────────────────────── + +class TestHoneydewToOsi: + def test_basic_conversion(self, tmp_path): + _write_workspace(str(tmp_path), "tpch", [{ + "name": "orders", "keys": ["orderkey"], "key_dataset": "tpch_orders", + "sql": "SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS", + "dataset_attrs": [ + {"column": "o_orderkey", "name": "orderkey", "datatype": "number"}, + {"column": "o_orderdate", "name": "orderdate", "datatype": "date"}, + ], + }]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + sm = result["semantic_model"][0] + assert sm["name"] == "tpch" + ds = sm["datasets"][0] + assert ds["source"] == "SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS" + assert ds["primary_key"] == ["orderkey"] + + def test_field_types_from_datatypes(self, tmp_path): + _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], + "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [ + {"column": "id", "name": "id", "datatype": "number"}, + {"column": "status", "name": "status", "datatype": "string"}, + {"column": "created_at", "name": "created_at", "datatype": "timestamp"}, + ]}]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + fields = {f["name"]: f for f in result["semantic_model"][0]["datasets"][0]["fields"]} + assert fields["id"].get("dimension") is None + assert fields["status"]["dimension"] == {"is_time": False} + assert 
fields["created_at"]["dimension"] == {"is_time": True} + + def test_labels_become_osi_label_and_ai_context(self, tmp_path): + _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], + "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [ + {"column": "status", "name": "status", "datatype": "string", + "labels": ["sales", "reporting"]}, + ]}]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + f = next(f for f in result["semantic_model"][0]["datasets"][0]["fields"] if f["name"] == "status") + assert f["label"] == "sales" + assert "sales" in (f.get("ai_context") or {}).get("synonyms", []) + + def test_many_to_one_relation_to_osi(self, tmp_path): + _write_workspace(str(tmp_path), "ws", [ + {"name": "orders", "keys": ["order_id"], "key_dataset": "orders", "sql": "db.s.orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "connection": [{"src_field": "customer_id", "target_field": "id"}]}], + "dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", "sql": "db.s.customers", "dataset_attrs": []}, + ]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + rels = result["semantic_model"][0]["relationships"] + assert len(rels) == 1 + assert rels[0]["from"] == "orders" and rels[0]["to"] == "customers" + + def test_one_to_many_direction_flipped(self, tmp_path): + _write_workspace(str(tmp_path), "ws", [ + {"name": "customers", "keys": ["id"], "key_dataset": "customers", "sql": "db.s.customers", + "relations": [{"target_entity": "orders", "rel_type": "one-to-many", + "connection": [{"src_field": "id", "target_field": "customer_id"}]}], + "dataset_attrs": []}, + {"name": "orders", "keys": ["order_id"], "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": []}, + ]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + rel = result["semantic_model"][0]["relationships"][0] + assert rel["from"] == "orders" and rel["to"] == "customers" 
+ + def test_duplicate_relations_deduplicated(self, tmp_path): + _write_workspace(str(tmp_path), "ws", [ + {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "connection": [{"src_field": "cid", "target_field": "id"}]}], + "dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", "sql": "db.s.customers", + "relations": [{"target_entity": "orders", "rel_type": "one-to-many", + "connection": [{"src_field": "id", "target_field": "cid"}]}], + "dataset_attrs": []}, + ]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert len(result["semantic_model"][0].get("relationships", [])) == 1 + + def test_metrics_converted(self, tmp_path): + _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], + "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], + "metrics": [{"type": "metric", "entity": "orders", "name": "count", + "datatype": "number", "sql": "COUNT(*)"}]}]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + m = result["semantic_model"][0]["metrics"][0] + assert m["name"] == "count" + assert m["expression"]["dialects"][0]["expression"] == "COUNT(*)" + + def test_metric_entity_preserved_in_custom_extension(self, tmp_path): + _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], + "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], + "metrics": [{"type": "metric", "entity": "orders", "name": "cnt", + "datatype": "number", "sql": "COUNT(*)"}]}]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + m = result["semantic_model"][0]["metrics"][0] + ext = m["custom_extensions"][0] + assert ext["vendor_name"] == "HONEYDEW" + assert json.loads(ext["data"])["entity"] == "orders" + + def test_calculated_attribute_as_field(self, tmp_path): + _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], + 
"key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "discounted", "datatype": "number", + "sql": "orders.price * (1 - orders.discount)"}]}]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + fields = {f["name"]: f for f in result["semantic_model"][0]["datasets"][0]["fields"]} + assert "discounted" in fields + assert "orders.price" in fields["discounted"]["expression"]["dialects"][0]["expression"] + + def test_missing_workspace_yml_raises(self, tmp_path): + with pytest.raises(HoneydewConversionError, match="workspace.yml"): + convert_honeydew_to_osi(str(tmp_path)) + + def test_missing_schema_dir_produces_empty_model(self, tmp_path): + (tmp_path / "workspace.yml").write_text(yaml.dump({"type": "workspace", "name": "ws"})) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert result["semantic_model"][0]["datasets"] == [] + + def test_vendors_includes_honeydew(self, tmp_path): + (tmp_path / "workspace.yml").write_text(yaml.dump({"type": "workspace", "name": "ws"})) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert "HONEYDEW" in result.get("vendors", []) + + def test_empty_metrics_skipped(self, tmp_path): + _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], + "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], + "metrics": [{"type": "metric", "entity": "orders", "name": "bad", + "datatype": "number", "sql": ""}]}]) + with warnings.catch_warnings(record=True): + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert "metrics" not in result["semantic_model"][0] + + +# ───────────────────────────────────────────────────────────────────────────── +# Round-trip tests (idempotency) +# ───────────────────────────────────────────────────────────────────────────── + +class TestOsiToHoneydewToOsiRoundTrip: + """OSI → Honeydew → OSI: verify key fields survive both 
legs.""" + + def _roundtrip(self, model_dict, tmp_path): + files = convert_osi_to_honeydew(_osi(model_dict)) + for rel_path, content in files.items(): + p = tmp_path / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + return yaml.safe_load(convert_honeydew_to_osi(str(tmp_path)))["semantic_model"][0] + + def test_name_and_description_preserved(self, tmp_path): + model = {"name": "retail", "description": "Retail model", "datasets": []} + sm = self._roundtrip(model, tmp_path) + assert sm["name"] == "retail" + assert sm["description"] == "Retail model" + + def test_primary_key_preserved(self, tmp_path): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "primary_key": ["order_id"], "fields": []}]} + sm = self._roundtrip(model, tmp_path) + assert sm["datasets"][0]["primary_key"] == ["order_id"] + + def test_composite_primary_key_preserved(self, tmp_path): + model = {"name": "m", "datasets": [{"name": "li", "source": "db.s.li", + "primary_key": ["order_id", "line_no"], "fields": []}]} + sm = self._roundtrip(model, tmp_path) + assert sm["datasets"][0]["primary_key"] == ["order_id", "line_no"] + + def test_unique_keys_preserved(self, tmp_path): + model = {"name": "m", "datasets": [{"name": "items", "source": "db.s.items", + "primary_key": ["id"], + "unique_keys": [["sku"], ["id", "variant"]], + "fields": []}]} + sm = self._roundtrip(model, tmp_path) + assert sm["datasets"][0]["unique_keys"] == [["sku"], ["id", "variant"]] + + def test_field_label_preserved(self, tmp_path): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "status", "label": "sales", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, + "dimension": {"is_time": False}}]}]} + sm = self._roundtrip(model, tmp_path) + f = next(f for f in sm["datasets"][0]["fields"] if f["name"] == "status") + assert f["label"] == "sales" + + def test_ai_context_string_preserved(self, 
tmp_path): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "status", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, + "ai_context": "order status, order state", + "dimension": {"is_time": False}}]}]} + sm = self._roundtrip(model, tmp_path) + f = next(f for f in sm["datasets"][0]["fields"] if f["name"] == "status") + # String ai_context is merged into description; exact form may vary + assert f.get("description") or f.get("ai_context") + + def test_ai_context_dict_preserved(self, tmp_path): + ctx = {"instructions": "Use for revenue analysis", "synonyms": ["revenue", "sales"]} + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "total", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, + "ai_context": ctx}]}]} + sm = self._roundtrip(model, tmp_path) + f = next(f for f in sm["datasets"][0]["fields"] if f["name"] == "total") + assert f.get("ai_context") == ctx + + def test_model_ai_context_preserved(self, tmp_path): + ctx = {"instructions": "Retail analytics", "synonyms": ["store"]} + model = {"name": "m", "ai_context": ctx, "datasets": []} + sm = self._roundtrip(model, tmp_path) + assert sm.get("ai_context") == ctx + + def test_non_honeydew_custom_extensions_preserved(self, tmp_path): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "custom_extensions": [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}], + "fields": []}]} + sm = self._roundtrip(model, tmp_path) + exts = sm["datasets"][0].get("custom_extensions") or [] + assert any(e["vendor_name"] == "SNOWFLAKE" for e in exts) + + def test_relationship_name_preserved(self, tmp_path): + model = {"name": "m", "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], "relationships": [{"name": "orders_to_customers", 
"from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}]} + sm = self._roundtrip(model, tmp_path) + assert sm["relationships"][0]["name"] == "orders_to_customers" + + def test_relationship_columns_preserved(self, tmp_path): + model = {"name": "m", "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], "relationships": [{"name": "r", "from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}]} + sm = self._roundtrip(model, tmp_path) + rel = sm["relationships"][0] + assert rel["from_columns"] == ["cid"] and rel["to_columns"] == ["id"] + + def test_metric_name_and_expression_preserved(self, tmp_path): + model = {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [{"name": "total_revenue", "description": "Sum of sales", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}]} + sm = self._roundtrip(model, tmp_path) + m = sm["metrics"][0] + assert m["name"] == "total_revenue" + assert m["expression"]["dialects"][0]["expression"] == "SUM(orders.total)" + assert m["description"] == "Sum of sales" + + def test_tpcds_example_roundtrip(self, tmp_path): + tpcds_path = ( + Path(__file__).resolve().parent.parent.parent.parent + / "examples" / "tpcds_semantic_model.yaml" + ) + if not tpcds_path.exists(): + pytest.skip("TPC-DS example not found") + osi_yaml = tpcds_path.read_text() + files = convert_osi_to_honeydew(osi_yaml) + for rel_path, content in files.items(): + p = tmp_path / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + sm = result["semantic_model"][0] + assert sm["name"] == "tpcds_retail_model" + ds_names = {ds["name"] for ds in sm["datasets"]} + assert "store_sales" in ds_names and "customer" in ds_names + + +class 
TestHoneydewToOsiToHoneydewRoundTrip: + """Honeydew → OSI → Honeydew: verify Honeydew-specific fields survive.""" + + def _roundtrip(self, entities, tmp_path): + _write_workspace(str(tmp_path), "ws", entities) + osi_yaml = convert_honeydew_to_osi(str(tmp_path)) + files = convert_osi_to_honeydew(osi_yaml) + # Write to a second directory + out_dir = tmp_path / "out" + for rel_path, content in files.items(): + p = out_dir / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + return out_dir + + def test_entity_name_and_keys_preserved(self, tmp_path): + out_dir = self._roundtrip([{ + "name": "orders", "keys": ["order_id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": [], + }], tmp_path) + entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) + assert entity["name"] == "orders" + assert entity["keys"] == ["order_id"] + + def test_source_preserved(self, tmp_path): + out_dir = self._roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.SCHEMA.ORDERS", "dataset_attrs": [], + }], tmp_path) + ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) + assert ds["sql"] == "DB.SCHEMA.ORDERS" + + def test_column_attributes_preserved(self, tmp_path): + out_dir = self._roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", + "dataset_attrs": [ + {"column": "o_id", "name": "id", "datatype": "number"}, + {"column": "o_status", "name": "status", "datatype": "string"}, + ], + }], tmp_path) + ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) + attrs = {a["name"]: a for a in ds["attributes"]} + assert attrs["id"]["column"] == "o_id" + assert attrs["status"]["datatype"] == "string" + + def test_labels_preserved_on_column(self, tmp_path): + out_dir = self._roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", + "dataset_attrs": [ + {"column": 
"status", "name": "status", "datatype": "string", "labels": ["sales"]}, + ], + }], tmp_path) + ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) + attrs = {a["name"]: a for a in ds["attributes"]} + assert "sales" in attrs["status"].get("labels", []) + + def test_calculated_attribute_sql_preserved(self, tmp_path): + out_dir = self._roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "disc", "datatype": "number", + "sql": "orders.price * (1 - orders.discount)"}], + }], tmp_path) + calc = yaml.safe_load((out_dir / "schema/orders/attributes/disc.yml").read_text()) + assert calc["sql"] == "orders.price * (1 - orders.discount)" + + def test_metric_entity_assignment_preserved(self, tmp_path): + out_dir = self._roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": [], + "metrics": [{"type": "metric", "entity": "orders", "name": "cnt", + "datatype": "number", "sql": "COUNT(*)"}], + }], tmp_path) + m = yaml.safe_load((out_dir / "schema/orders/metrics/cnt.yml").read_text()) + assert m["entity"] == "orders" + assert m["sql"] == "COUNT(*)" + + def test_relation_preserved(self, tmp_path): + out_dir = self._roundtrip([ + {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "connection": [{"src_field": "cid", "target_field": "id"}]}], + "dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", + "sql": "DB.S.CUSTOMERS", "dataset_attrs": []}, + ], tmp_path) + entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) + assert entity["relations"][0]["target_entity"] == "customers" + assert entity["relations"][0]["connection"][0]["src_field"] == "cid" From ee2e0e93d784222f5f16b78e58e4750500f38476 
Mon Sep 17 00:00:00 2001 From: Baruch Oxman Date: Wed, 27 May 2026 15:42:36 +0300 Subject: [PATCH 02/13] Add .gitignore for honeydew converter Co-Authored-By: Claude Sonnet 4.6 --- converters/honeydew/.gitignore | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 converters/honeydew/.gitignore diff --git a/converters/honeydew/.gitignore b/converters/honeydew/.gitignore new file mode 100644 index 0000000..b5fed1f --- /dev/null +++ b/converters/honeydew/.gitignore @@ -0,0 +1,8 @@ +__pycache__/ +*.py[cod] +.pytest_cache/ +*.egg-info/ +dist/ +build/ +.venv/ +venv/ From f5f4a717013cdf571654fd049f03d44ccfa0909a Mon Sep 17 00:00:00 2001 From: Baruch Oxman Date: Wed, 27 May 2026 16:12:32 +0300 Subject: [PATCH 03/13] Fix round-trip bugs and update README MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix empty-string expression bypassing None guard - Restore calculated_attribute routing via HONEYDEW type hint - Preserve bool datatype through OSI round-trip - Store string metric ai_context in osi metadata for recovery - Warn on duplicate metric names instead of silently overwriting - Restore connection_expr from HONEYDEW custom_extension on OSI→Honeydew - Warn on malformed JSON in _read_osi_metadata instead of silent drop - Remove filter limitation from README (not applicable) - Update Honeydew docs link to honeydew.ai/docs Co-Authored-By: Claude Sonnet 4.6 --- converters/honeydew/README.md | 7 +- .../honeydew/src/honeydew_osi_converter.py | 29 ++++- .../tests/test_honeydew_osi_converter.py | 107 ++++++++++++++++++ 3 files changed, 134 insertions(+), 9 deletions(-) diff --git a/converters/honeydew/README.md b/converters/honeydew/README.md index eb0e54b..8f8a25e 100644 --- a/converters/honeydew/README.md +++ b/converters/honeydew/README.md @@ -1,6 +1,6 @@ # OSI ↔ Honeydew Converter -Bidirectional converter between [OSI](../../core-spec/spec.md) semantic models and [Honeydew](https://docs.honeydew.ai) workspace YAML. 
+Bidirectional converter between [OSI](../../core-spec/spec.md) semantic models and [Honeydew](https://honeydew.ai/docs) workspace YAML. ## Overview @@ -62,7 +62,6 @@ python -m pytest tests/ - **One dataset per entity**: The converter maps each OSI dataset to a single Honeydew entity with one source dataset. Multiple datasets per entity are not generated. - **Datatype inference**: OSI fields have no explicit datatype; the converter infers Honeydew datatypes from the `dimension.is_time` flag (`timestamp`) and the presence/absence of the `dimension` key (`string` vs `number`). - **Honeydew SQL expressions**: Calculated attributes and metrics use Honeydew's `entity.attribute` reference syntax. These are exported as `ANSI_SQL` dialect expressions in OSI; they remain valid for round-tripping but may not run on other databases without adaptation. -- **Filters**: Honeydew `filter` objects have no OSI equivalent and are not exported. - **Perspectives and domains**: Not converted (no OSI equivalent). -- **Connection expressions** (`connection_expr`): Preserved in `HONEYDEW` custom extensions on the OSI relationship. -- **`ai_context`**: OSI `ai_context` fields (synonyms, instructions) are dropped during OSI → Honeydew conversion (no native Honeydew equivalent). Honeydew `description` fields are mapped to OSI `description`. +- **Connection expressions** (`connection_expr`): Preserved in `HONEYDEW` custom extensions on the OSI relationship and restored on the return trip. +- **`ai_context`**: OSI `ai_context` fields (synonyms, instructions) are stored in Honeydew `metadata` for round-trip recovery. Instructions are also merged into `description` for human readability. 
diff --git a/converters/honeydew/src/honeydew_osi_converter.py b/converters/honeydew/src/honeydew_osi_converter.py index 0fb9e0a..30c8dcd 100644 --- a/converters/honeydew/src/honeydew_osi_converter.py +++ b/converters/honeydew/src/honeydew_osi_converter.py @@ -192,7 +192,7 @@ def _dataset_to_files( raise HoneydewConversionError(f"Field missing 'name' in dataset '{entity_name}'") expr = _pick_ansi_expression(field.get("expression"), field_name) - if expr is None: + if not expr: continue datatype = _osi_field_to_honeydew_datatype(field) @@ -223,7 +223,10 @@ def _dataset_to_files( custom_extensions=field_ext or None, ) - if _is_simple_identifier(expr): + hd_hint = _get_honeydew_extension(field) + force_calc = hd_hint.get("type") == "calculated_attribute" + + if _is_simple_identifier(expr) and not force_calc: attr: dict[str, Any] = {"column": expr, "name": field_name, "datatype": datatype} if effective_desc: attr["description"] = effective_desc @@ -293,13 +296,18 @@ def _dataset_to_files( metric_ext = [e for e in (metric.get("custom_extensions") or []) if e.get("vendor_name") != HONEYDEW_VENDOR] metric_meta = _build_osi_metadata( - ai_context=metric_ai_ctx if isinstance(metric_ai_ctx, dict) else None, + ai_context=metric_ai_ctx, custom_extensions=metric_ext or None, ) if metric_meta: metric_dict["metadata"] = [metric_meta] - files[f"{base}/metrics/{mname}.yml"] = _dump(metric_dict) + metric_path = f"{base}/metrics/{mname}.yml" + if metric_path in files: + warnings.warn( + f"Metric '{mname}' in entity '{entity_name}' is defined more than once; later definition wins" + ) + files[metric_path] = _dump(metric_dict) return files @@ -331,6 +339,10 @@ def _osi_relation_to_honeydew(rel: dict[str, Any]) -> dict[str, Any] | None: {"src_field": fc, "target_field": tc} for fc, tc in zip(from_cols, to_cols) ] + elif not from_cols: + hd_ext = _get_honeydew_extension(rel) + if hd_ext.get("connection_expr"): + honeydew_rel["connection_expr"] = {"sql": hd_ext["connection_expr"]} return 
honeydew_rel @@ -364,6 +376,9 @@ def _pick_ansi_expression(expression: Any, field_name: str) -> str | None: def _osi_field_to_honeydew_datatype(field: dict[str, Any]) -> str: + hd_ext = _get_honeydew_extension(field) + if hd_ext.get("datatype"): + return hd_ext["datatype"] dimension = field.get("dimension") if isinstance(dimension, dict) and dimension.get("is_time"): return "timestamp" @@ -660,6 +675,8 @@ def _entity_to_osi_dataset(entity_data: dict[str, Any]) -> dict[str, Any]: k: attr[k] for k in ("display_name", "hidden", "folder", "format_string", "timegrain") if k in attr } + if datatype == "bool": + attr_honeydew_extra["datatype"] = datatype if len(attr_labels) > 1: attr_honeydew_extra["labels"] = attr_labels @@ -707,6 +724,8 @@ def _entity_to_osi_dataset(entity_data: dict[str, Any]) -> dict[str, Any]: k: calc[k] for k in ("display_name", "hidden", "folder", "format_string", "timegrain") if k in calc } + if datatype == "bool": + calc_honeydew_extra["datatype"] = datatype all_calc_ext = list(calc_osi_meta.get("custom_extensions") or []) # Always mark as calculated_attribute so OSI → Honeydew routes it correctly @@ -860,7 +879,7 @@ def _read_osi_metadata(obj: dict[str, Any]) -> dict[str, Any]: try: result[key] = json.loads(raw) except (json.JSONDecodeError, TypeError): - pass + warnings.warn(f"Could not parse OSI metadata field '{key}': {raw!r}") return result return {} diff --git a/converters/honeydew/tests/test_honeydew_osi_converter.py b/converters/honeydew/tests/test_honeydew_osi_converter.py index a4b683d..81a449d 100644 --- a/converters/honeydew/tests/test_honeydew_osi_converter.py +++ b/converters/honeydew/tests/test_honeydew_osi_converter.py @@ -886,3 +886,110 @@ def test_relation_preserved(self, tmp_path): entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) assert entity["relations"][0]["target_entity"] == "customers" assert entity["relations"][0]["connection"][0]["src_field"] == "cid" + + def test_bool_datatype_preserved(self, 
tmp_path): + out_dir = self._roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", + "dataset_attrs": [ + {"column": "is_active", "name": "is_active", "datatype": "bool"}, + ], + }], tmp_path) + ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) + attrs = {a["name"]: a for a in ds["attributes"]} + assert attrs["is_active"]["datatype"] == "bool" + + def test_connection_expr_preserved(self, tmp_path): + out_dir = self._roundtrip([ + {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "connection_expr": {"sql": "orders.cid = customers.id AND orders.region = customers.region"}}], + "dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", + "sql": "DB.S.CUSTOMERS", "dataset_attrs": []}, + ], tmp_path) + entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) + rel = entity["relations"][0] + assert rel.get("connection_expr", {}).get("sql") == "orders.cid = customers.id AND orders.region = customers.region" + + def test_calc_attr_with_simple_identifier_sql_preserved(self, tmp_path): + out_dir = self._roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "revenue", "datatype": "number", "sql": "revenue"}], + }], tmp_path) + # sql='revenue' is a simple identifier — must still come back as calculated_attribute + calc_path = out_dir / "schema/orders/attributes/revenue.yml" + assert calc_path.exists(), "calculated_attribute with simple-id sql should not become a dataset column" + calc = yaml.safe_load(calc_path.read_text()) + assert calc["sql"] == "revenue" + + +# ───────────────────────────────────────────────────────────────────────────── +# Bug-fix regression tests +# 
───────────────────────────────────────────────────────────────────────────── + +class TestBugFixes: + def test_empty_string_expression_skipped(self): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "bad", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": ""}]}, + "dimension": {"is_time": False}, + }]}]} + files = convert_osi_to_honeydew(_osi(model)) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + names = [a["name"] for a in ds["attributes"]] + assert "bad" not in names + assert "schema/orders/attributes/bad.yml" not in files + + def test_duplicate_metric_name_warns(self): + model = {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [ + {"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.a)"}]}}, + {"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.b)"}]}}, + ]} + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + files = convert_osi_to_honeydew(_osi(model)) + assert any("total" in str(x.message) for x in w) + # Last definition wins + m = yaml.safe_load(files["schema/orders/metrics/total.yml"]) + assert "orders.b" in m["sql"] + + def test_metric_string_ai_context_preserved_in_roundtrip(self, tmp_path): + model = {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [{"name": "rev", "ai_context": "Use for revenue analysis", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}]} + files = convert_osi_to_honeydew(_osi(model)) + for rel_path, content in files.items(): + p = tmp_path / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + m = result["semantic_model"][0]["metrics"][0] + assert m.get("ai_context") == "Use for revenue 
analysis" + + def test_malformed_osi_metadata_json_warns(self, tmp_path): + ws_path = tmp_path / "workspace.yml" + ws_path.write_text(yaml.dump({"type": "workspace", "name": "ws"})) + base = tmp_path / "schema" / "orders" + (base / "datasets").mkdir(parents=True) + entity = { + "type": "entity", "name": "orders", "keys": ["id"], "key_dataset": "orders", + "relations": [], + "metadata": [{"name": "osi", "metadata": [ + {"name": "unique_keys", "value": "[broken json"}, + ]}], + } + (base / "orders.yml").write_text(yaml.dump(entity)) + (base / "datasets" / "orders.yml").write_text(yaml.dump( + {"type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", "attributes": []} + )) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + convert_honeydew_to_osi(str(tmp_path)) + assert any("unique_keys" in str(x.message) for x in w) From 7cc9ea3a11ff9d07f1da7c386291dd0d12b05d65 Mon Sep 17 00:00:00 2001 From: Baruch Oxman Date: Wed, 27 May 2026 16:29:58 +0300 Subject: [PATCH 04/13] Fix code review findings: round-trip data loss, path traversal, expression guards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Restore Honeydew-specific fields (display_name, hidden, format_string, timegrain, owner, folder) in OSI→Honeydew direction by reading them back from the HONEYDEW custom_extension on both entity and attribute objects - Add path traversal guard in main() using os.path.normpath + startswith check - Guard against whitespace-only expressions (not expr or not expr.strip()) - Warn when field expression is a non-dict value instead of silently dropping it - Simplify elif not from_cols: to else: in _osi_relation_to_honeydew - Strengthen test_ai_context_string_preserved to assert the string value is recoverable in description - Add 5 new tests: display_name/format round-trip, calc attr Honeydew fields, entity owner, whitespace expression skipped, non-dict 
expression warns Co-Authored-By: Claude Sonnet 4.6 --- .../honeydew/src/honeydew_osi_converter.py | 31 ++++++- .../tests/test_honeydew_osi_converter.py | 86 ++++++++++++++++++- 2 files changed, 110 insertions(+), 7 deletions(-) diff --git a/converters/honeydew/src/honeydew_osi_converter.py b/converters/honeydew/src/honeydew_osi_converter.py index 30c8dcd..25d9ab0 100644 --- a/converters/honeydew/src/honeydew_osi_converter.py +++ b/converters/honeydew/src/honeydew_osi_converter.py @@ -164,6 +164,14 @@ def _dataset_to_files( entity_dict["keys"] = list(primary_key) entity_dict["key_dataset"] = entity_name + # Restore Honeydew-specific entity fields from HONEYDEW custom_extension + entity_hd_hint = _get_honeydew_extension(ds) + for key in ("owner", "display_name", "hidden", "folder"): + if key in entity_hd_hint: + entity_dict[key] = entity_hd_hint[key] + if "labels" in entity_hd_hint: + entity_dict["labels"] = entity_hd_hint["labels"] + honeydew_relations = [] for rel in relations: hr = _osi_relation_to_honeydew(rel) @@ -192,7 +200,7 @@ def _dataset_to_files( raise HoneydewConversionError(f"Field missing 'name' in dataset '{entity_name}'") expr = _pick_ansi_expression(field.get("expression"), field_name) - if not expr: + if not expr or not expr.strip(): continue datatype = _osi_field_to_honeydew_datatype(field) @@ -226,6 +234,8 @@ def _dataset_to_files( hd_hint = _get_honeydew_extension(field) force_calc = hd_hint.get("type") == "calculated_attribute" + _hd_attr_keys = ("display_name", "hidden", "folder", "format_string", "timegrain") + if _is_simple_identifier(expr) and not force_calc: attr: dict[str, Any] = {"column": expr, "name": field_name, "datatype": datatype} if effective_desc: @@ -234,6 +244,9 @@ def _dataset_to_files( attr["labels"] = labels if field_meta: attr["metadata"] = [field_meta] + for _k in _hd_attr_keys: + if _k in hd_hint: + attr[_k] = hd_hint[_k] dataset_attrs.append(attr) else: calc: dict[str, Any] = { @@ -249,6 +262,9 @@ def _dataset_to_files( 
calc["labels"] = labels if field_meta: calc["metadata"] = [field_meta] + for _k in _hd_attr_keys: + if _k in hd_hint: + calc[_k] = hd_hint[_k] calc_attrs.append(calc) # ── dataset YAML ─────────────────────────────────────────────────────────── @@ -276,7 +292,7 @@ def _dataset_to_files( if not mname: continue mexpr = _pick_ansi_expression(metric.get("expression"), mname) - if mexpr is None: + if not mexpr or not mexpr.strip(): continue metric_dict: dict[str, Any] = { @@ -339,7 +355,7 @@ def _osi_relation_to_honeydew(rel: dict[str, Any]) -> dict[str, Any] | None: {"src_field": fc, "target_field": tc} for fc, tc in zip(from_cols, to_cols) ] - elif not from_cols: + else: hd_ext = _get_honeydew_extension(rel) if hd_ext.get("connection_expr"): honeydew_rel["connection_expr"] = {"sql": hd_ext["connection_expr"]} @@ -348,7 +364,10 @@ def _osi_relation_to_honeydew(rel: dict[str, Any]) -> dict[str, Any] | None: def _pick_ansi_expression(expression: Any, field_name: str) -> str | None: """Select the ANSI_SQL expression; fall back to first available dialect.""" + if expression is None: + return None if not isinstance(expression, dict): + warnings.warn(f"'{field_name}': 'expression' must be a mapping; field will be skipped") return None dialects = expression.get("dialects") or [] if not dialects: @@ -942,8 +961,12 @@ def main() -> None: print(f"Error: {e}", file=sys.stderr) sys.exit(1) + output_abs = os.path.abspath(args.output) for rel_path, content in files.items(): - full_path = os.path.join(args.output, rel_path) + full_path = os.path.normpath(os.path.join(output_abs, rel_path)) + if not full_path.startswith(output_abs + os.sep): + print(f"Error: refusing to write outside output directory: {rel_path}", file=sys.stderr) + sys.exit(1) os.makedirs(os.path.dirname(full_path), exist_ok=True) with open(full_path, "w") as f: f.write(content) diff --git a/converters/honeydew/tests/test_honeydew_osi_converter.py b/converters/honeydew/tests/test_honeydew_osi_converter.py index 
81a449d..9115e7c 100644 --- a/converters/honeydew/tests/test_honeydew_osi_converter.py +++ b/converters/honeydew/tests/test_honeydew_osi_converter.py @@ -710,15 +710,16 @@ def test_field_label_preserved(self, tmp_path): assert f["label"] == "sales" def test_ai_context_string_preserved(self, tmp_path): + ai_ctx_value = "order status, order state" model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{"name": "status", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, - "ai_context": "order status, order state", + "ai_context": ai_ctx_value, "dimension": {"is_time": False}}]}]} sm = self._roundtrip(model, tmp_path) f = next(f for f in sm["datasets"][0]["fields"] if f["name"] == "status") - # String ai_context is merged into description; exact form may vary - assert f.get("description") or f.get("ai_context") + # String ai_context is merged into description on OSI→Honeydew; value must be recoverable + assert ai_ctx_value in (f.get("description") or "") or f.get("ai_context") == ai_ctx_value def test_ai_context_dict_preserved(self, tmp_path): ctx = {"instructions": "Use for revenue analysis", "synonyms": ["revenue", "sales"]} @@ -912,6 +913,60 @@ def test_connection_expr_preserved(self, tmp_path): rel = entity["relations"][0] assert rel.get("connection_expr", {}).get("sql") == "orders.cid = customers.id AND orders.region = customers.region" + def test_dataset_attr_display_name_and_format_preserved(self, tmp_path): + out_dir = self._roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", + "dataset_attrs": [ + {"column": "status", "name": "status", "datatype": "string", + "display_name": "Order Status", "hidden": True, "format_string": "##,###"}, + ], + }], tmp_path) + ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) + attrs = {a["name"]: a for a in ds["attributes"]} + assert attrs["status"]["display_name"] == "Order Status" + assert 
attrs["status"]["hidden"] is True + assert attrs["status"]["format_string"] == "##,###" + + def test_calc_attr_honeydew_fields_preserved(self, tmp_path): + out_dir = self._roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "disc", "datatype": "number", + "sql": "orders.price * 0.9", + "display_name": "Discounted Price", + "timegrain": "day"}], + }], tmp_path) + calc = yaml.safe_load((out_dir / "schema/orders/attributes/disc.yml").read_text()) + assert calc["display_name"] == "Discounted Price" + assert calc["timegrain"] == "day" + + def test_entity_owner_and_display_name_preserved(self, tmp_path): + ws_path = tmp_path / "workspace.yml" + ws_path.write_text(yaml.dump({"type": "workspace", "name": "ws"})) + base = tmp_path / "schema" / "orders" + (base / "datasets").mkdir(parents=True) + (base / "orders.yml").write_text(yaml.dump({ + "type": "entity", "name": "orders", "keys": ["id"], + "key_dataset": "orders", "relations": [], + "owner": "analytics_team", "display_name": "Orders Table", + })) + (base / "datasets" / "orders.yml").write_text(yaml.dump({ + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", "attributes": [], + })) + osi_yaml = convert_honeydew_to_osi(str(tmp_path)) + files = convert_osi_to_honeydew(osi_yaml) + out_dir = tmp_path / "out" + for rel_path, content in files.items(): + p = out_dir / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) + assert entity.get("owner") == "analytics_team" + assert entity.get("display_name") == "Orders Table" + def test_calc_attr_with_simple_identifier_sql_preserved(self, tmp_path): out_dir = self._roundtrip([{ "name": "orders", "keys": ["id"], "key_dataset": "orders", @@ -972,6 +1027,31 @@ def 
test_metric_string_ai_context_preserved_in_roundtrip(self, tmp_path): m = result["semantic_model"][0]["metrics"][0] assert m.get("ai_context") == "Use for revenue analysis" + def test_whitespace_expression_skipped(self): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "bad", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": " "}]}, + "dimension": {"is_time": False}, + }]}]} + files = convert_osi_to_honeydew(_osi(model)) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + names = [a["name"] for a in ds["attributes"]] + assert "bad" not in names + assert "schema/orders/attributes/bad.yml" not in files + + def test_non_dict_expression_warns(self): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "bad", + "expression": "just_a_string", + "dimension": {"is_time": False}, + }]}]} + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + files = convert_osi_to_honeydew(_osi(model)) + assert any("must be a mapping" in str(x.message) for x in w) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + assert all(a["name"] != "bad" for a in ds["attributes"]) + def test_malformed_osi_metadata_json_warns(self, tmp_path): ws_path = tmp_path / "workspace.yml" ws_path.write_text(yaml.dump({"type": "workspace", "name": "ws"})) From b5e565cd45cf3a53846276864ab578184f475719 Mon Sep 17 00:00:00 2001 From: Baruch Oxman Date: Wed, 27 May 2026 17:11:30 +0300 Subject: [PATCH 05/13] Refactor tests to pytest functions + parametrize; require Python 3.12 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rewrite entire test file from classes to module-level pytest functions; parametrize repeated cases (is_simple_identifier, parse_osi_source, field datatypes, entity honeydew fields, dataset/calc attr fields, empty/whitespace expressions, path traversal guard, 
check_safe_path) - Extract _check_safe_path into a named helper in the converter (was inlined in main()) so path traversal logic is independently testable - Add parametrized test_check_safe_path covering ../evil.yml and ../../etc/passwd rejection and legitimate nested paths - Add test_empty_or_whitespace_metric_expression_skipped to cover the OSI→Honeydew whitespace guard on metrics (was previously untested) - Parametrize entity/attr/calc Honeydew field round-trip tests (owner, display_name, hidden, folder each verified independently) - Update _write_workspace helper to pass through entity-level fields (owner, display_name, hidden, folder) so round-trip tests use the standard _honeydew_roundtrip() helper instead of manual workspace setup - Promote _HD_ATTR_KEYS tuple to module-level constant (was defined inside the field loop on every iteration) - Drop from __future__ import annotations — requires Python 3.12+ - Add pyproject.toml with requires-python = ">=3.12" - Add Python 3.12+ requirement section to README Co-Authored-By: Claude Sonnet 4.6 --- converters/honeydew/README.md | 5 + converters/honeydew/pyproject.toml | 25 + .../honeydew/src/honeydew_osi_converter.py | 27 +- .../tests/test_honeydew_osi_converter.py | 1813 +++++++++-------- 4 files changed, 968 insertions(+), 902 deletions(-) create mode 100644 converters/honeydew/pyproject.toml diff --git a/converters/honeydew/README.md b/converters/honeydew/README.md index 8f8a25e..103d19a 100644 --- a/converters/honeydew/README.md +++ b/converters/honeydew/README.md @@ -35,6 +35,11 @@ Bidirectional converter between [OSI](../../core-spec/spec.md) semantic models a | `entity.relations` (`one-to-many`) | `relationship` with `from` = target entity | | `metric.sql` | `metric` expression in `ANSI_SQL` dialect | +## Requirements + +- Python 3.12+ +- PyYAML 6.0+ + ## Setup ```bash diff --git a/converters/honeydew/pyproject.toml b/converters/honeydew/pyproject.toml new file mode 100644 index 0000000..82df220 --- /dev/null 
+++ b/converters/honeydew/pyproject.toml @@ -0,0 +1,25 @@ +[project] +name = "honeydew-osi" +version = "0.2.0.dev0" +description = "Bidirectional converter between Honeydew workspace YAML and OSI semantic model" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "pyyaml>=6.0", +] + +[dependency-groups] +dev = [ + "pytest>=8.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build] +include = ["src/**/*"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +pythonpath = ["src"] diff --git a/converters/honeydew/src/honeydew_osi_converter.py b/converters/honeydew/src/honeydew_osi_converter.py index 25d9ab0..e082360 100644 --- a/converters/honeydew/src/honeydew_osi_converter.py +++ b/converters/honeydew/src/honeydew_osi_converter.py @@ -11,8 +11,6 @@ python honeydew_osi_converter.py honeydew-to-osi -i workspace_dir/ -o output.yaml """ -from __future__ import annotations - import argparse import json import os @@ -26,6 +24,7 @@ SUPPORTED_OSI_VERSION = "0.2.0.dev0" HONEYDEW_VENDOR = "HONEYDEW" _OSI_METADATA_SECTION = "osi" +_HD_ATTR_KEYS = ("display_name", "hidden", "folder", "format_string", "timegrain") class HoneydewConversionError(Exception): @@ -234,8 +233,6 @@ def _dataset_to_files( hd_hint = _get_honeydew_extension(field) force_calc = hd_hint.get("type") == "calculated_attribute" - _hd_attr_keys = ("display_name", "hidden", "folder", "format_string", "timegrain") - if _is_simple_identifier(expr) and not force_calc: attr: dict[str, Any] = {"column": expr, "name": field_name, "datatype": datatype} if effective_desc: @@ -244,9 +241,9 @@ def _dataset_to_files( attr["labels"] = labels if field_meta: attr["metadata"] = [field_meta] - for _k in _hd_attr_keys: - if _k in hd_hint: - attr[_k] = hd_hint[_k] + for k in _HD_ATTR_KEYS: + if k in hd_hint: + attr[k] = hd_hint[k] dataset_attrs.append(attr) else: calc: dict[str, Any] = { @@ -262,9 +259,9 @@ def _dataset_to_files( calc["labels"] = labels if 
field_meta: calc["metadata"] = [field_meta] - for _k in _hd_attr_keys: - if _k in hd_hint: - calc[_k] = hd_hint[_k] + for k in _HD_ATTR_KEYS: + if k in hd_hint: + calc[k] = hd_hint[k] calc_attrs.append(calc) # ── dataset YAML ─────────────────────────────────────────────────────────── @@ -931,6 +928,12 @@ def _dump(data: Any) -> str: return yaml.dump(data, default_flow_style=False, sort_keys=False, allow_unicode=True) +def _check_safe_path(output_abs: str, rel_path: str) -> bool: + """Return True iff the resolved path stays inside output_abs.""" + full = os.path.normpath(os.path.join(output_abs, rel_path)) + return full.startswith(output_abs + os.sep) + + # ───────────────────────────────────────────────────────────────────────────── # CLI # ───────────────────────────────────────────────────────────────────────────── @@ -963,10 +966,10 @@ def main() -> None: output_abs = os.path.abspath(args.output) for rel_path, content in files.items(): - full_path = os.path.normpath(os.path.join(output_abs, rel_path)) - if not full_path.startswith(output_abs + os.sep): + if not _check_safe_path(output_abs, rel_path): print(f"Error: refusing to write outside output directory: {rel_path}", file=sys.stderr) sys.exit(1) + full_path = os.path.normpath(os.path.join(output_abs, rel_path)) os.makedirs(os.path.dirname(full_path), exist_ok=True) with open(full_path, "w") as f: f.write(content) diff --git a/converters/honeydew/tests/test_honeydew_osi_converter.py b/converters/honeydew/tests/test_honeydew_osi_converter.py index 9115e7c..810661e 100644 --- a/converters/honeydew/tests/test_honeydew_osi_converter.py +++ b/converters/honeydew/tests/test_honeydew_osi_converter.py @@ -1,7 +1,5 @@ """Tests for the bidirectional OSI ↔ Honeydew converter.""" -from __future__ import annotations - import json import os import sys @@ -16,6 +14,7 @@ HoneydewConversionError, _assign_metrics_to_entities, _build_osi_metadata, + _check_safe_path, _find_entity_in_expression, 
_honeydew_datatype_to_osi_dimension, _is_simple_identifier, @@ -90,6 +89,9 @@ def _write_workspace(tmp_dir, workspace_name, entities): "key_dataset": e.get("key_dataset", ename), "relations": e.get("relations", []), } + for k in ("owner", "display_name", "hidden", "folder"): + if k in e: + entity_dict[k] = e[k] with open(os.path.join(base, f"{ename}.yml"), "w") as f: yaml.dump(entity_dict, f) @@ -114,962 +116,993 @@ def _write_workspace(tmp_dir, workspace_name, entities): yaml.dump(m, f) -# ───────────────────────────────────────────────────────────────────────────── -# Unit tests – helpers -# ───────────────────────────────────────────────────────────────────────────── - -class TestIsSimpleIdentifier: - def test_plain_name(self): - assert _is_simple_identifier("order_id") is True - - def test_with_spaces(self): - assert _is_simple_identifier("SUM(x)") is False - - def test_with_dot(self): - assert _is_simple_identifier("orders.id") is False - - def test_leading_number(self): - assert _is_simple_identifier("1col") is False - - def test_underscore_prefix(self): - assert _is_simple_identifier("_hidden") is True - - -class TestParseOsiSource: - def test_table_reference(self): - sql, dtype = _parse_osi_source("db.schema.table") - assert sql == "db.schema.table" and dtype == "table" +def _osi_roundtrip(model_dict, tmp_path): + """OSI → Honeydew → OSI; returns the semantic model dict.""" + files = convert_osi_to_honeydew(_osi(model_dict)) + for rel_path, content in files.items(): + p = tmp_path / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + return yaml.safe_load(convert_honeydew_to_osi(str(tmp_path)))["semantic_model"][0] - def test_select_query(self): - _, dtype = _parse_osi_source("SELECT id FROM foo") - assert dtype == "sql" - def test_with_query(self): - _, dtype = _parse_osi_source("WITH cte AS (SELECT 1) SELECT * FROM cte") - assert dtype == "sql" +def _honeydew_roundtrip(entities, tmp_path): + """Honeydew → OSI → Honeydew; 
returns Path to the output workspace directory.""" + _write_workspace(str(tmp_path), "ws", entities) + osi_yaml = convert_honeydew_to_osi(str(tmp_path)) + files = convert_osi_to_honeydew(osi_yaml) + out_dir = tmp_path / "out" + for rel_path, content in files.items(): + p = out_dir / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + return out_dir - def test_empty(self): - sql, dtype = _parse_osi_source("") - assert sql == "" and dtype == "table" - - -class TestPickAnsiExpression: - def test_ansi_preferred(self): - expr = {"dialects": [ - {"dialect": "SNOWFLAKE", "expression": "col::VARCHAR"}, - {"dialect": "ANSI_SQL", "expression": "col"}, - ]} - assert _pick_ansi_expression(expr, "f") == "col" - - def test_fallback_to_first(self): - expr = {"dialects": [{"dialect": "SNOWFLAKE", "expression": "col::VARCHAR"}]} - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - result = _pick_ansi_expression(expr, "f") - assert result == "col::VARCHAR" - assert any("ANSI_SQL" in str(x.message) for x in w) - - def test_none_on_missing(self): - assert _pick_ansi_expression(None, "f") is None - assert _pick_ansi_expression({"dialects": []}, "f") is None +# ───────────────────────────────────────────────────────────────────────────── +# Unit tests – helpers +# ───────────────────────────────────────────────────────────────────────────── -class TestOsiFieldDatatypes: - def test_time_dimension(self): - assert _osi_field_to_honeydew_datatype({"dimension": {"is_time": True}}) == "timestamp" +@pytest.mark.parametrize("expr,expected", [ + ("order_id", True), + ("SUM(x)", False), + ("orders.id", False), + ("1col", False), + ("_hidden", True), +]) +def test_is_simple_identifier(expr, expected): + assert _is_simple_identifier(expr) is expected + + +@pytest.mark.parametrize("source,expected_sql,expected_type", [ + ("db.schema.table", "db.schema.table", "table"), + ("SELECT id FROM foo", "SELECT id FROM foo", "sql"), + ("WITH cte AS 
(SELECT 1) SELECT * FROM cte", "WITH cte AS (SELECT 1) SELECT * FROM cte", "sql"), + ("", "", "table"), +]) +def test_parse_osi_source(source, expected_sql, expected_type): + sql, dtype = _parse_osi_source(source) + assert sql == expected_sql and dtype == expected_type + + +@pytest.mark.parametrize("field,expected_dt", [ + ({"dimension": {"is_time": True}}, "timestamp"), + ({"dimension": {"is_time": False}}, "string"), + ({}, "number"), +]) +def test_osi_field_to_honeydew_datatype(field, expected_dt): + assert _osi_field_to_honeydew_datatype(field) == expected_dt + + +@pytest.mark.parametrize("datatype,expected_dim", [ + ("date", {"is_time": True}), + ("timestamp", {"is_time": True}), + ("string", {"is_time": False}), + ("bool", {"is_time": False}), + ("number", None), + ("float", None), +]) +def test_honeydew_datatype_to_osi_dimension(datatype, expected_dim): + assert _honeydew_datatype_to_osi_dimension(datatype) == expected_dim + + +@pytest.mark.parametrize("expr,entities,expected", [ + ("SUM(orders.total)", {"orders", "customers"}, "orders"), + ("orders.a / customers.b", {"orders", "customers"}, "orders"), + ("COUNT(*)", {"orders"}, None), + ("SUM(foo.col)", {"orders"}, None), +]) +def test_find_entity_in_expression(expr, entities, expected): + assert _find_entity_in_expression(expr, entities) == expected + + +def test_pick_ansi_expression_ansi_preferred(): + expr = {"dialects": [ + {"dialect": "SNOWFLAKE", "expression": "col::VARCHAR"}, + {"dialect": "ANSI_SQL", "expression": "col"}, + ]} + assert _pick_ansi_expression(expr, "f") == "col" + + +def test_pick_ansi_expression_fallback_warns(): + expr = {"dialects": [{"dialect": "SNOWFLAKE", "expression": "col::VARCHAR"}]} + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = _pick_ansi_expression(expr, "f") + assert result == "col::VARCHAR" + assert any("ANSI_SQL" in str(x.message) for x in w) + + +@pytest.mark.parametrize("expression", [None, {"dialects": []}]) +def 
test_pick_ansi_expression_returns_none(expression): + assert _pick_ansi_expression(expression, "f") is None + + +def test_pick_ansi_expression_non_dict_warns(): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = _pick_ansi_expression("just_a_string", "f") + assert result is None + assert any("must be a mapping" in str(x.message) for x in w) - def test_dimension(self): - assert _osi_field_to_honeydew_datatype({"dimension": {"is_time": False}}) == "string" - def test_fact(self): - assert _osi_field_to_honeydew_datatype({}) == "number" +# ───────────────────────────────────────────────────────────────────────────── +# OSI metadata helpers +# ───────────────────────────────────────────────────────────────────────────── +def test_build_and_read_ai_context_string(): + section = _build_osi_metadata(ai_context="orders, purchases") + result = _read_osi_metadata({"metadata": [section]}) + assert result["ai_context"] == "orders, purchases" -class TestHoneydewDatatypeToOsiDimension: - def test_date(self): - assert _honeydew_datatype_to_osi_dimension("date") == {"is_time": True} - def test_timestamp(self): - assert _honeydew_datatype_to_osi_dimension("timestamp") == {"is_time": True} +def test_build_and_read_ai_context_dict(): + ctx = {"instructions": "Use for sales", "synonyms": ["orders", "purchases"]} + section = _build_osi_metadata(ai_context=ctx) + result = _read_osi_metadata({"metadata": [section]}) + assert result["ai_context"] == ctx - def test_string(self): - assert _honeydew_datatype_to_osi_dimension("string") == {"is_time": False} - def test_bool(self): - assert _honeydew_datatype_to_osi_dimension("bool") == {"is_time": False} +def test_build_and_read_unique_keys(): + uks = [["col1", "col2"], ["col3"]] + section = _build_osi_metadata(unique_keys=uks) + result = _read_osi_metadata({"metadata": [section]}) + assert result["unique_keys"] == uks - def test_number(self): - assert _honeydew_datatype_to_osi_dimension("number") is 
None - def test_float(self): - assert _honeydew_datatype_to_osi_dimension("float") is None +def test_build_and_read_custom_extensions(): + exts = [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}] + section = _build_osi_metadata(custom_extensions=exts) + result = _read_osi_metadata({"metadata": [section]}) + assert result["custom_extensions"] == exts -class TestFindEntityInExpression: - def test_finds_entity(self): - assert _find_entity_in_expression("SUM(orders.total)", {"orders", "customers"}) == "orders" +def test_read_osi_metadata_no_osi_section(): + assert _read_osi_metadata({"metadata": [{"name": "other", "metadata": []}]}) == {} - def test_returns_first_match(self): - result = _find_entity_in_expression("orders.a / customers.b", {"orders", "customers"}) - assert result == "orders" - def test_no_match(self): - assert _find_entity_in_expression("COUNT(*)", {"orders"}) is None +def test_read_osi_metadata_no_metadata(): + assert _read_osi_metadata({}) == {} - def test_ignores_non_entity_prefixes(self): - assert _find_entity_in_expression("SUM(foo.col)", {"orders"}) is None +def test_build_osi_metadata_nothing_to_store(): + assert _build_osi_metadata() is None -class TestOsiMetadataHelpers: - def test_build_and_read_ai_context_string(self): - section = _build_osi_metadata(ai_context="orders, purchases") - obj = {"metadata": [section]} - result = _read_osi_metadata(obj) - assert result["ai_context"] == "orders, purchases" - def test_build_and_read_ai_context_dict(self): - ctx = {"instructions": "Use for sales", "synonyms": ["orders", "purchases"]} - section = _build_osi_metadata(ai_context=ctx) - obj = {"metadata": [section]} - result = _read_osi_metadata(obj) - assert result["ai_context"] == ctx +# ───────────────────────────────────────────────────────────────────────────── +# Assign metrics to entities +# ───────────────────────────────────────────────────────────────────────────── - def test_build_and_read_unique_keys(self): - uks = [["col1", 
"col2"], ["col3"]] - section = _build_osi_metadata(unique_keys=uks) - obj = {"metadata": [section]} - result = _read_osi_metadata(obj) - assert result["unique_keys"] == uks +def test_assign_metrics_by_expression(): + metrics = [{"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}] + result = _assign_metrics_to_entities(metrics, ["orders", "customers"]) + assert "total" in [m["name"] for m in result.get("orders", [])] - def test_build_and_read_custom_extensions(self): - exts = [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}] - section = _build_osi_metadata(custom_extensions=exts) - obj = {"metadata": [section]} - result = _read_osi_metadata(obj) - assert result["custom_extensions"] == exts - def test_returns_empty_when_no_osi_section(self): - obj = {"metadata": [{"name": "other", "metadata": []}]} - assert _read_osi_metadata(obj) == {} +def test_assign_metrics_honeydew_hint_takes_priority(): + metrics = [{ + "name": "cnt", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.x)"}]}, + "custom_extensions": [{"vendor_name": "HONEYDEW", "data": '{"entity": "customers"}'}], + }] + result = _assign_metrics_to_entities(metrics, ["orders", "customers"]) + assert "cnt" in [m["name"] for m in result.get("customers", [])] + assert "orders" not in result - def test_returns_empty_when_no_metadata(self): - assert _read_osi_metadata({}) == {} - def test_build_returns_none_when_nothing_to_store(self): - assert _build_osi_metadata() is None +def test_assign_metrics_fallback_to_first_entity(): + metrics = [{"name": "cnt", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "COUNT(*)"}]}}] + with warnings.catch_warnings(record=True): + result = _assign_metrics_to_entities(metrics, ["orders"]) + assert "cnt" in [m["name"] for m in result.get("orders", [])] -class TestAssignMetricsToEntities: - def test_assigns_by_expression(self): - metrics = [{"name": "total", "expression": 
{"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}] - result = _assign_metrics_to_entities(metrics, ["orders", "customers"]) - assert "total" in [m["name"] for m in result.get("orders", [])] +def test_assign_metrics_no_entities(): + metrics = [{"name": "m", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "COUNT(*)"}]}}] + with warnings.catch_warnings(record=True): + result = _assign_metrics_to_entities(metrics, []) + assert result == {} - def test_honeydew_hint_takes_priority(self): - metrics = [{ - "name": "cnt", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.x)"}]}, - "custom_extensions": [{"vendor_name": "HONEYDEW", "data": '{"entity": "customers"}'}], - }] - result = _assign_metrics_to_entities(metrics, ["orders", "customers"]) - # hint says customers even though expression references orders - assert "cnt" in [m["name"] for m in result.get("customers", [])] - assert "orders" not in result - def test_falls_back_to_first_entity(self): - metrics = [{"name": "cnt", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "COUNT(*)"}]}}] - with warnings.catch_warnings(record=True): - result = _assign_metrics_to_entities(metrics, ["orders"]) - assert "cnt" in [m["name"] for m in result.get("orders", [])] +# ───────────────────────────────────────────────────────────────────────────── +# Path traversal guard +# ───────────────────────────────────────────────────────────────────────────── - def test_no_entities(self): - metrics = [{"name": "m", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "COUNT(*)"}]}}] - with warnings.catch_warnings(record=True): - result = _assign_metrics_to_entities(metrics, []) - assert result == {} +@pytest.mark.parametrize("rel_path,expected", [ + ("workspace.yml", True), + ("schema/orders/orders.yml", True), + ("schema/orders/datasets/orders.yml", True), + ("../evil.yml", False), + ("../../etc/passwd", False), + ("schema/../../../evil", 
False), +]) +def test_check_safe_path(rel_path, expected): + output_abs = os.path.abspath("/tmp/test_output") + assert _check_safe_path(output_abs, rel_path) is expected # ───────────────────────────────────────────────────────────────────────────── # OSI → Honeydew integration tests # ───────────────────────────────────────────────────────────────────────────── -class TestOsiToHoneydew: - def test_workspace_yml_created(self): - files = convert_osi_to_honeydew(_osi(_minimal_model())) - ws = yaml.safe_load(files["workspace.yml"]) - assert ws["name"] == "test_model" and ws["type"] == "workspace" - - def test_entity_yml_created(self): - files = convert_osi_to_honeydew(_osi(_minimal_model())) - entity = yaml.safe_load(files["schema/orders/orders.yml"]) - assert entity["name"] == "orders" - assert entity["keys"] == ["order_id"] - assert entity["key_dataset"] == "orders" - - def test_dataset_yml_created(self): - files = convert_osi_to_honeydew(_osi(_minimal_model())) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - assert ds["sql"] == "db.schema.orders" - assert ds["dataset_type"] == "table" - - def test_simple_fields_become_dataset_attributes(self): - files = convert_osi_to_honeydew(_osi(_minimal_model())) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - names = [a["name"] for a in ds["attributes"]] - assert "order_id" in names and "order_date" in names and "total" in names - - def test_time_field_gets_timestamp_datatype(self): - files = convert_osi_to_honeydew(_osi(_minimal_model())) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - attrs = {a["name"]: a for a in ds["attributes"]} - assert attrs["order_date"]["datatype"] == "timestamp" - - def test_fact_field_gets_number_datatype(self): - files = convert_osi_to_honeydew(_osi(_minimal_model())) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - attrs = {a["name"]: a for a in ds["attributes"]} - assert attrs["total"]["datatype"] == "number" - - def 
test_complex_expression_becomes_calculated_attribute(self): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ - "name": "disc_price", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "price * (1 - discount)"}]}, - "dimension": {"is_time": False}, - }]}]} - files = convert_osi_to_honeydew(_osi(model)) - assert "schema/orders/attributes/disc_price.yml" in files - calc = yaml.safe_load(files["schema/orders/attributes/disc_price.yml"]) - assert calc["type"] == "calculated_attribute" - assert calc["sql"] == "price * (1 - discount)" - - def test_label_mapped_to_honeydew_labels(self): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ - "name": "status", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, - "dimension": {"is_time": False}, - "label": "sales", - }]}]} - files = convert_osi_to_honeydew(_osi(model)) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - attrs = {a["name"]: a for a in ds["attributes"]} - assert "sales" in attrs["status"]["labels"] - - def test_ai_context_string_merged_into_description(self): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ - "name": "total", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, - "description": "Base desc", - "ai_context": "revenue, earnings", - }]}]} - files = convert_osi_to_honeydew(_osi(model)) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - attrs = {a["name"]: a for a in ds["attributes"]} - assert "revenue, earnings" in attrs["total"]["description"] - - def test_ai_context_dict_instructions_merged_into_description(self): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ - "name": "total", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, - "ai_context": {"instructions": "Use for revenue", 
"synonyms": ["rev", "earnings"]}, - }]}]} - files = convert_osi_to_honeydew(_osi(model)) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - attrs = {a["name"]: a for a in ds["attributes"]} - assert "Use for revenue" in attrs["total"]["description"] - assert "rev" in attrs["total"]["labels"] - - def test_ai_context_dict_stored_in_metadata(self): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ - "name": "total", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, - "ai_context": {"instructions": "Use for revenue", "synonyms": ["rev"]}, - }]}]} - files = convert_osi_to_honeydew(_osi(model)) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - attr = next(a for a in ds["attributes"] if a["name"] == "total") - # Should be in the osi metadata section - osi_section = next((s for s in attr.get("metadata", []) if s["name"] == "osi"), None) - assert osi_section is not None - ai_item = next((i for i in osi_section["metadata"] if i["name"] == "ai_context"), None) - assert ai_item is not None - - def test_unique_keys_stored_in_entity_metadata(self): - model = {"name": "m", "datasets": [{"name": "items", "source": "db.s.items", - "primary_key": ["item_id"], - "unique_keys": [["sku"], ["item_id", "variant"]], - "fields": []}]} - files = convert_osi_to_honeydew(_osi(model)) - entity = yaml.safe_load(files["schema/items/items.yml"]) - osi_section = next((s for s in entity.get("metadata", []) if s["name"] == "osi"), None) - assert osi_section is not None - uk_item = next((i for i in osi_section["metadata"] if i["name"] == "unique_keys"), None) - assert uk_item is not None - assert json.loads(uk_item["value"]) == [["sku"], ["item_id", "variant"]] - - def test_non_honeydew_custom_extensions_stored_in_metadata(self): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", - "custom_extensions": [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}], 
- "fields": []}]} - files = convert_osi_to_honeydew(_osi(model)) - entity = yaml.safe_load(files["schema/orders/orders.yml"]) - osi_section = next((s for s in entity.get("metadata", []) if s["name"] == "osi"), None) - assert osi_section is not None - ext_item = next((i for i in osi_section["metadata"] if i["name"] == "custom_extensions"), None) - assert ext_item is not None - exts = json.loads(ext_item["value"]) - assert any(e["vendor_name"] == "SNOWFLAKE" for e in exts) - - def test_relationship_name_stored_in_relation(self): - model = {"name": "m", "datasets": [ +def test_osi_to_honeydew_workspace_yml(): + files = convert_osi_to_honeydew(_osi(_minimal_model())) + ws = yaml.safe_load(files["workspace.yml"]) + assert ws["name"] == "test_model" and ws["type"] == "workspace" + + +def test_osi_to_honeydew_entity_yml(): + files = convert_osi_to_honeydew(_osi(_minimal_model())) + entity = yaml.safe_load(files["schema/orders/orders.yml"]) + assert entity["name"] == "orders" + assert entity["keys"] == ["order_id"] + assert entity["key_dataset"] == "orders" + + +def test_osi_to_honeydew_dataset_yml(): + files = convert_osi_to_honeydew(_osi(_minimal_model())) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + assert ds["sql"] == "db.schema.orders" + assert ds["dataset_type"] == "table" + + +def test_osi_to_honeydew_simple_fields_become_dataset_attributes(): + files = convert_osi_to_honeydew(_osi(_minimal_model())) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + names = [a["name"] for a in ds["attributes"]] + assert "order_id" in names and "order_date" in names and "total" in names + + +@pytest.mark.parametrize("field_name,expected_dt", [ + ("order_date", "timestamp"), + ("total", "number"), +]) +def test_osi_to_honeydew_field_datatypes(field_name, expected_dt): + files = convert_osi_to_honeydew(_osi(_minimal_model())) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + attrs = {a["name"]: a for a in ds["attributes"]} + 
assert attrs[field_name]["datatype"] == expected_dt + + +def test_osi_to_honeydew_complex_expression_becomes_calculated_attribute(): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "disc_price", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "price * (1 - discount)"}]}, + "dimension": {"is_time": False}, + }]}]} + files = convert_osi_to_honeydew(_osi(model)) + assert "schema/orders/attributes/disc_price.yml" in files + calc = yaml.safe_load(files["schema/orders/attributes/disc_price.yml"]) + assert calc["type"] == "calculated_attribute" + assert calc["sql"] == "price * (1 - discount)" + + +def test_osi_to_honeydew_label_mapped_to_honeydew_labels(): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "status", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, + "dimension": {"is_time": False}, + "label": "sales", + }]}]} + files = convert_osi_to_honeydew(_osi(model)) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + attrs = {a["name"]: a for a in ds["attributes"]} + assert "sales" in attrs["status"]["labels"] + + +def test_osi_to_honeydew_ai_context_string_merged_into_description(): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "total", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, + "description": "Base desc", + "ai_context": "revenue, earnings", + }]}]} + files = convert_osi_to_honeydew(_osi(model)) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + attrs = {a["name"]: a for a in ds["attributes"]} + assert "revenue, earnings" in attrs["total"]["description"] + + +def test_osi_to_honeydew_ai_context_dict_instructions_merged_into_description(): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "total", + "expression": {"dialects": 
[{"dialect": "ANSI_SQL", "expression": "total"}]}, + "ai_context": {"instructions": "Use for revenue", "synonyms": ["rev", "earnings"]}, + }]}]} + files = convert_osi_to_honeydew(_osi(model)) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + attrs = {a["name"]: a for a in ds["attributes"]} + assert "Use for revenue" in attrs["total"]["description"] + assert "rev" in attrs["total"]["labels"] + + +def test_osi_to_honeydew_ai_context_dict_stored_in_metadata(): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "total", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, + "ai_context": {"instructions": "Use for revenue", "synonyms": ["rev"]}, + }]}]} + files = convert_osi_to_honeydew(_osi(model)) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + attr = next(a for a in ds["attributes"] if a["name"] == "total") + osi_section = next((s for s in attr.get("metadata", []) if s["name"] == "osi"), None) + assert osi_section is not None + assert any(i["name"] == "ai_context" for i in osi_section["metadata"]) + + +def test_osi_to_honeydew_unique_keys_stored_in_entity_metadata(): + model = {"name": "m", "datasets": [{"name": "items", "source": "db.s.items", + "primary_key": ["item_id"], + "unique_keys": [["sku"], ["item_id", "variant"]], + "fields": []}]} + files = convert_osi_to_honeydew(_osi(model)) + entity = yaml.safe_load(files["schema/items/items.yml"]) + osi_section = next((s for s in entity.get("metadata", []) if s["name"] == "osi"), None) + assert osi_section is not None + uk_item = next((i for i in osi_section["metadata"] if i["name"] == "unique_keys"), None) + assert uk_item is not None + assert json.loads(uk_item["value"]) == [["sku"], ["item_id", "variant"]] + + +def test_osi_to_honeydew_non_honeydew_extensions_stored_in_metadata(): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "custom_extensions": [{"vendor_name": 
"SNOWFLAKE", "data": '{"warehouse": "WH"}'}], + "fields": []}]} + files = convert_osi_to_honeydew(_osi(model)) + entity = yaml.safe_load(files["schema/orders/orders.yml"]) + osi_section = next((s for s in entity.get("metadata", []) if s["name"] == "osi"), None) + assert osi_section is not None + ext_item = next((i for i in osi_section["metadata"] if i["name"] == "custom_extensions"), None) + assert ext_item is not None + exts = json.loads(ext_item["value"]) + assert any(e["vendor_name"] == "SNOWFLAKE" for e in exts) + + +def test_osi_to_honeydew_relationship_name_stored_in_relation(): + model = {"name": "m", "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}]} + files = convert_osi_to_honeydew(_osi(model)) + entity = yaml.safe_load(files["schema/orders/orders.yml"]) + assert entity["relations"][0]["name"] == "orders_to_customers" + + +def test_osi_to_honeydew_model_ai_context_stored_in_workspace_metadata(): + model = {"name": "m", "datasets": [], + "ai_context": {"instructions": "Use for retail analytics", "synonyms": ["store"]}} + files = convert_osi_to_honeydew(_osi(model)) + ws = yaml.safe_load(files["workspace.yml"]) + assert any(s["name"] == "osi" for s in ws.get("metadata", [])) + + +def test_osi_to_honeydew_relationship_on_from_entity_only(): + model = {"name": "m", "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], "relationships": [{"name": "r", "from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}]} + files = convert_osi_to_honeydew(_osi(model)) + orders = yaml.safe_load(files["schema/orders/orders.yml"]) + customers = yaml.safe_load(files["schema/customers/customers.yml"]) + assert 
len(orders["relations"]) == 1 + assert customers["relations"] == [] + rel = orders["relations"][0] + assert rel["target_entity"] == "customers" and rel["rel_type"] == "many-to-one" + assert rel["connection"] == [{"src_field": "cid", "target_field": "id"}] + + +def test_osi_to_honeydew_metric_assigned_by_expression_entity(): + model = {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [{"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}]} + files = convert_osi_to_honeydew(_osi(model)) + assert "schema/orders/metrics/total.yml" in files + + +def test_osi_to_honeydew_metric_entity_hint_overrides_expression(): + model = {"name": "m", + "datasets": [ {"name": "orders", "source": "db.s.orders", "fields": []}, {"name": "customers", "source": "db.s.customers", "fields": []}, - ], "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", - "from_columns": ["cid"], "to_columns": ["id"]}]} - files = convert_osi_to_honeydew(_osi(model)) - entity = yaml.safe_load(files["schema/orders/orders.yml"]) - assert entity["relations"][0]["name"] == "orders_to_customers" + ], + "metrics": [{ + "name": "cnt", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.x)"}]}, + "custom_extensions": [{"vendor_name": "HONEYDEW", "data": '{"entity": "customers"}'}], + }]} + files = convert_osi_to_honeydew(_osi(model)) + assert "schema/customers/metrics/cnt.yml" in files + assert "schema/orders/metrics/cnt.yml" not in files - def test_model_ai_context_stored_in_workspace_metadata(self): - model = {"name": "m", "datasets": [], - "ai_context": {"instructions": "Use for retail analytics", "synonyms": ["store"]}} - files = convert_osi_to_honeydew(_osi(model)) - ws = yaml.safe_load(files["workspace.yml"]) - osi_section = next((s for s in ws.get("metadata", []) if s["name"] == "osi"), None) - assert osi_section is not None - def 
test_relationship_added_to_entity(self): - model = {"name": "m", "datasets": [ - {"name": "orders", "source": "db.s.orders", "fields": []}, - {"name": "customers", "source": "db.s.customers", "fields": []}, - ], "relationships": [{"name": "r", "from": "orders", "to": "customers", - "from_columns": ["cid"], "to_columns": ["id"]}]} - files = convert_osi_to_honeydew(_osi(model)) - entity = yaml.safe_load(files["schema/orders/orders.yml"]) - assert len(entity["relations"]) == 1 - rel = entity["relations"][0] - assert rel["target_entity"] == "customers" - assert rel["rel_type"] == "many-to-one" - assert rel["connection"] == [{"src_field": "cid", "target_field": "id"}] - - def test_to_entity_has_no_relation(self): - model = {"name": "m", "datasets": [ - {"name": "orders", "source": "db.s.orders", "fields": []}, - {"name": "customers", "source": "db.s.customers", "fields": []}, - ], "relationships": [{"name": "r", "from": "orders", "to": "customers", - "from_columns": ["cid"], "to_columns": ["id"]}]} - files = convert_osi_to_honeydew(_osi(model)) - entity = yaml.safe_load(files["schema/customers/customers.yml"]) - assert entity["relations"] == [] +def test_osi_to_honeydew_invalid_version_raises(): + with pytest.raises(HoneydewConversionError, match="Unsupported"): + convert_osi_to_honeydew("version: '9.9.9'\nsemantic_model:\n - name: m\n") - def test_metric_assigned_by_expression_entity(self): - model = {"name": "m", - "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], - "metrics": [{"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}]} - files = convert_osi_to_honeydew(_osi(model)) - assert "schema/orders/metrics/total.yml" in files - - def test_metric_entity_hint_overrides_expression(self): - model = {"name": "m", - "datasets": [ - {"name": "orders", "source": "db.s.orders", "fields": []}, - {"name": "customers", "source": "db.s.customers", "fields": []}, - ], - "metrics": [{ - "name": "cnt", - 
"expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.x)"}]}, - "custom_extensions": [{"vendor_name": "HONEYDEW", "data": '{"entity": "customers"}'}], - }]} - files = convert_osi_to_honeydew(_osi(model)) - assert "schema/customers/metrics/cnt.yml" in files - assert "schema/orders/metrics/cnt.yml" not in files - def test_invalid_version_raises(self): - with pytest.raises(HoneydewConversionError, match="Unsupported"): - convert_osi_to_honeydew("version: '9.9.9'\nsemantic_model:\n - name: m\n") +def test_osi_to_honeydew_missing_semantic_model_raises(): + with pytest.raises(HoneydewConversionError): + convert_osi_to_honeydew(f"version: '{OSI_VERSION}'\n") - def test_missing_semantic_model_raises(self): - with pytest.raises(HoneydewConversionError): - convert_osi_to_honeydew(f"version: '{OSI_VERSION}'\n") - def test_subquery_source_uses_sql_type(self): - model = {"name": "m", "datasets": [{"name": "orders", - "source": "SELECT * FROM raw.orders WHERE active = true", "fields": []}]} - files = convert_osi_to_honeydew(_osi(model)) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - assert ds["dataset_type"] == "sql" +def test_osi_to_honeydew_subquery_source_uses_sql_type(): + model = {"name": "m", "datasets": [{"name": "orders", + "source": "SELECT * FROM raw.orders WHERE active = true", "fields": []}]} + files = convert_osi_to_honeydew(_osi(model)) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + assert ds["dataset_type"] == "sql" - def test_composite_primary_key(self): - model = {"name": "m", "datasets": [{"name": "li", "source": "db.s.li", - "primary_key": ["order_id", "line_number"], "fields": []}]} - files = convert_osi_to_honeydew(_osi(model)) - entity = yaml.safe_load(files["schema/li/li.yml"]) - assert entity["keys"] == ["order_id", "line_number"] - def test_multiple_semantic_models_warns(self): - doc = yaml.dump({"version": OSI_VERSION, "semantic_model": [ - {"name": "m1", "datasets": []}, - {"name": "m2", 
"datasets": []}, - ]}) - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - files = convert_osi_to_honeydew(doc) - assert any("only the first" in str(x.message) for x in w) - assert yaml.safe_load(files["workspace.yml"])["name"] == "m1" +def test_osi_to_honeydew_composite_primary_key(): + model = {"name": "m", "datasets": [{"name": "li", "source": "db.s.li", + "primary_key": ["order_id", "line_number"], "fields": []}]} + files = convert_osi_to_honeydew(_osi(model)) + entity = yaml.safe_load(files["schema/li/li.yml"]) + assert entity["keys"] == ["order_id", "line_number"] + + +def test_osi_to_honeydew_multiple_models_warns(): + doc = yaml.dump({"version": OSI_VERSION, "semantic_model": [ + {"name": "m1", "datasets": []}, + {"name": "m2", "datasets": []}, + ]}) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + files = convert_osi_to_honeydew(doc) + assert any("only the first" in str(x.message) for x in w) + assert yaml.safe_load(files["workspace.yml"])["name"] == "m1" # ───────────────────────────────────────────────────────────────────────────── # Honeydew → OSI integration tests # ───────────────────────────────────────────────────────────────────────────── -class TestHoneydewToOsi: - def test_basic_conversion(self, tmp_path): - _write_workspace(str(tmp_path), "tpch", [{ - "name": "orders", "keys": ["orderkey"], "key_dataset": "tpch_orders", - "sql": "SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS", - "dataset_attrs": [ - {"column": "o_orderkey", "name": "orderkey", "datatype": "number"}, - {"column": "o_orderdate", "name": "orderdate", "datatype": "date"}, - ], - }]) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - sm = result["semantic_model"][0] - assert sm["name"] == "tpch" - ds = sm["datasets"][0] - assert ds["source"] == "SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS" - assert ds["primary_key"] == ["orderkey"] - - def test_field_types_from_datatypes(self, tmp_path): - 
_write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], - "key_dataset": "orders", "sql": "db.s.orders", - "dataset_attrs": [ - {"column": "id", "name": "id", "datatype": "number"}, - {"column": "status", "name": "status", "datatype": "string"}, - {"column": "created_at", "name": "created_at", "datatype": "timestamp"}, - ]}]) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - fields = {f["name"]: f for f in result["semantic_model"][0]["datasets"][0]["fields"]} - assert fields["id"].get("dimension") is None - assert fields["status"]["dimension"] == {"is_time": False} - assert fields["created_at"]["dimension"] == {"is_time": True} - - def test_labels_become_osi_label_and_ai_context(self, tmp_path): - _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], - "key_dataset": "orders", "sql": "db.s.orders", - "dataset_attrs": [ - {"column": "status", "name": "status", "datatype": "string", - "labels": ["sales", "reporting"]}, - ]}]) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - f = next(f for f in result["semantic_model"][0]["datasets"][0]["fields"] if f["name"] == "status") - assert f["label"] == "sales" - assert "sales" in (f.get("ai_context") or {}).get("synonyms", []) - - def test_many_to_one_relation_to_osi(self, tmp_path): - _write_workspace(str(tmp_path), "ws", [ - {"name": "orders", "keys": ["order_id"], "key_dataset": "orders", "sql": "db.s.orders", - "relations": [{"target_entity": "customers", "rel_type": "many-to-one", - "connection": [{"src_field": "customer_id", "target_field": "id"}]}], - "dataset_attrs": []}, - {"name": "customers", "keys": ["id"], "key_dataset": "customers", "sql": "db.s.customers", "dataset_attrs": []}, - ]) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - rels = result["semantic_model"][0]["relationships"] - assert len(rels) == 1 - assert rels[0]["from"] == "orders" and rels[0]["to"] == "customers" - - def 
test_one_to_many_direction_flipped(self, tmp_path): - _write_workspace(str(tmp_path), "ws", [ - {"name": "customers", "keys": ["id"], "key_dataset": "customers", "sql": "db.s.customers", - "relations": [{"target_entity": "orders", "rel_type": "one-to-many", - "connection": [{"src_field": "id", "target_field": "customer_id"}]}], - "dataset_attrs": []}, - {"name": "orders", "keys": ["order_id"], "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": []}, - ]) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - rel = result["semantic_model"][0]["relationships"][0] - assert rel["from"] == "orders" and rel["to"] == "customers" - - def test_duplicate_relations_deduplicated(self, tmp_path): - _write_workspace(str(tmp_path), "ws", [ - {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", - "relations": [{"target_entity": "customers", "rel_type": "many-to-one", - "connection": [{"src_field": "cid", "target_field": "id"}]}], - "dataset_attrs": []}, - {"name": "customers", "keys": ["id"], "key_dataset": "customers", "sql": "db.s.customers", - "relations": [{"target_entity": "orders", "rel_type": "one-to-many", - "connection": [{"src_field": "id", "target_field": "cid"}]}], - "dataset_attrs": []}, - ]) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - assert len(result["semantic_model"][0].get("relationships", [])) == 1 - - def test_metrics_converted(self, tmp_path): - _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], - "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], - "metrics": [{"type": "metric", "entity": "orders", "name": "count", - "datatype": "number", "sql": "COUNT(*)"}]}]) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - m = result["semantic_model"][0]["metrics"][0] - assert m["name"] == "count" - assert m["expression"]["dialects"][0]["expression"] == "COUNT(*)" - - def test_metric_entity_preserved_in_custom_extension(self, tmp_path): - 
_write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], - "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], - "metrics": [{"type": "metric", "entity": "orders", "name": "cnt", - "datatype": "number", "sql": "COUNT(*)"}]}]) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - m = result["semantic_model"][0]["metrics"][0] - ext = m["custom_extensions"][0] - assert ext["vendor_name"] == "HONEYDEW" - assert json.loads(ext["data"])["entity"] == "orders" - - def test_calculated_attribute_as_field(self, tmp_path): - _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], - "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], - "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", - "name": "discounted", "datatype": "number", - "sql": "orders.price * (1 - orders.discount)"}]}]) +def test_honeydew_to_osi_basic(tmp_path): + _write_workspace(str(tmp_path), "tpch", [{ + "name": "orders", "keys": ["orderkey"], "key_dataset": "tpch_orders", + "sql": "SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS", + "dataset_attrs": [ + {"column": "o_orderkey", "name": "orderkey", "datatype": "number"}, + {"column": "o_orderdate", "name": "orderdate", "datatype": "date"}, + ], + }]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + sm = result["semantic_model"][0] + assert sm["name"] == "tpch" + ds = sm["datasets"][0] + assert ds["source"] == "SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS" + assert ds["primary_key"] == ["orderkey"] + + +@pytest.mark.parametrize("col_name,datatype,expected_dim", [ + ("id", "number", None), + ("status", "string", {"is_time": False}), + ("created_at", "timestamp", {"is_time": True}), +]) +def test_honeydew_to_osi_field_types(tmp_path, col_name, datatype, expected_dim): + _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], + "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [{"column": col_name, "name": col_name, "datatype": 
datatype}]}]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + fields = {f["name"]: f for f in result["semantic_model"][0]["datasets"][0]["fields"]} + assert fields[col_name].get("dimension") == expected_dim + + +def test_honeydew_to_osi_labels_become_label_and_ai_context(tmp_path): + _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], + "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [ + {"column": "status", "name": "status", "datatype": "string", + "labels": ["sales", "reporting"]}, + ]}]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + f = next(f for f in result["semantic_model"][0]["datasets"][0]["fields"] if f["name"] == "status") + assert f["label"] == "sales" + assert "sales" in (f.get("ai_context") or {}).get("synonyms", []) + + +def test_honeydew_to_osi_many_to_one_relation(tmp_path): + _write_workspace(str(tmp_path), "ws", [ + {"name": "orders", "keys": ["order_id"], "key_dataset": "orders", "sql": "db.s.orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "connection": [{"src_field": "customer_id", "target_field": "id"}]}], + "dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", "sql": "db.s.customers", "dataset_attrs": []}, + ]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + rels = result["semantic_model"][0]["relationships"] + assert len(rels) == 1 + assert rels[0]["from"] == "orders" and rels[0]["to"] == "customers" + + +def test_honeydew_to_osi_one_to_many_direction_flipped(tmp_path): + _write_workspace(str(tmp_path), "ws", [ + {"name": "customers", "keys": ["id"], "key_dataset": "customers", "sql": "db.s.customers", + "relations": [{"target_entity": "orders", "rel_type": "one-to-many", + "connection": [{"src_field": "id", "target_field": "customer_id"}]}], + "dataset_attrs": []}, + {"name": "orders", "keys": ["order_id"], "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": 
[]}, + ]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + rel = result["semantic_model"][0]["relationships"][0] + assert rel["from"] == "orders" and rel["to"] == "customers" + + +def test_honeydew_to_osi_duplicate_relations_deduplicated(tmp_path): + _write_workspace(str(tmp_path), "ws", [ + {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "connection": [{"src_field": "cid", "target_field": "id"}]}], + "dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", "sql": "db.s.customers", + "relations": [{"target_entity": "orders", "rel_type": "one-to-many", + "connection": [{"src_field": "id", "target_field": "cid"}]}], + "dataset_attrs": []}, + ]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert len(result["semantic_model"][0].get("relationships", [])) == 1 + + +def test_honeydew_to_osi_metric_converted(tmp_path): + _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], + "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], + "metrics": [{"type": "metric", "entity": "orders", "name": "count", + "datatype": "number", "sql": "COUNT(*)"}]}]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + m = result["semantic_model"][0]["metrics"][0] + assert m["name"] == "count" + assert m["expression"]["dialects"][0]["expression"] == "COUNT(*)" + + +def test_honeydew_to_osi_metric_entity_preserved_in_extension(tmp_path): + _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], + "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], + "metrics": [{"type": "metric", "entity": "orders", "name": "cnt", + "datatype": "number", "sql": "COUNT(*)"}]}]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + m = result["semantic_model"][0]["metrics"][0] + ext = m["custom_extensions"][0] + assert ext["vendor_name"] 
== "HONEYDEW" + assert json.loads(ext["data"])["entity"] == "orders" + + +def test_honeydew_to_osi_calculated_attribute_as_field(tmp_path): + _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], + "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "discounted", "datatype": "number", + "sql": "orders.price * (1 - orders.discount)"}]}]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + fields = {f["name"]: f for f in result["semantic_model"][0]["datasets"][0]["fields"]} + assert "discounted" in fields + assert "orders.price" in fields["discounted"]["expression"]["dialects"][0]["expression"] + + +def test_honeydew_to_osi_missing_workspace_raises(tmp_path): + with pytest.raises(HoneydewConversionError, match="workspace.yml"): + convert_honeydew_to_osi(str(tmp_path)) + + +def test_honeydew_to_osi_missing_schema_dir_empty_model(tmp_path): + (tmp_path / "workspace.yml").write_text(yaml.dump({"type": "workspace", "name": "ws"})) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert result["semantic_model"][0]["datasets"] == [] + + +def test_honeydew_to_osi_vendors_includes_honeydew(tmp_path): + (tmp_path / "workspace.yml").write_text(yaml.dump({"type": "workspace", "name": "ws"})) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert "HONEYDEW" in result.get("vendors", []) + + +def test_honeydew_to_osi_empty_metric_sql_skipped(tmp_path): + _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], + "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], + "metrics": [{"type": "metric", "entity": "orders", "name": "bad", + "datatype": "number", "sql": ""}]}]) + with warnings.catch_warnings(record=True): result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - fields = {f["name"]: f for f in result["semantic_model"][0]["datasets"][0]["fields"]} - assert 
"discounted" in fields - assert "orders.price" in fields["discounted"]["expression"]["dialects"][0]["expression"] - - def test_missing_workspace_yml_raises(self, tmp_path): - with pytest.raises(HoneydewConversionError, match="workspace.yml"): - convert_honeydew_to_osi(str(tmp_path)) + assert "metrics" not in result["semantic_model"][0] - def test_missing_schema_dir_produces_empty_model(self, tmp_path): - (tmp_path / "workspace.yml").write_text(yaml.dump({"type": "workspace", "name": "ws"})) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - assert result["semantic_model"][0]["datasets"] == [] - def test_vendors_includes_honeydew(self, tmp_path): - (tmp_path / "workspace.yml").write_text(yaml.dump({"type": "workspace", "name": "ws"})) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - assert "HONEYDEW" in result.get("vendors", []) +# ───────────────────────────────────────────────────────────────────────────── +# OSI → Honeydew → OSI round-trip tests +# ───────────────────────────────────────────────────────────────────────────── - def test_empty_metrics_skipped(self, tmp_path): - _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], - "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], - "metrics": [{"type": "metric", "entity": "orders", "name": "bad", - "datatype": "number", "sql": ""}]}]) - with warnings.catch_warnings(record=True): - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - assert "metrics" not in result["semantic_model"][0] +def test_osi_roundtrip_name_and_description(tmp_path): + model = {"name": "retail", "description": "Retail model", "datasets": []} + sm = _osi_roundtrip(model, tmp_path) + assert sm["name"] == "retail" and sm["description"] == "Retail model" + + +@pytest.mark.parametrize("primary_key", [ + ["order_id"], + ["order_id", "line_no"], +]) +def test_osi_roundtrip_primary_key(tmp_path, primary_key): + model = {"name": "m", "datasets": [{"name": "orders", 
"source": "db.s.orders", + "primary_key": primary_key, "fields": []}]} + sm = _osi_roundtrip(model, tmp_path) + assert sm["datasets"][0]["primary_key"] == primary_key + + +def test_osi_roundtrip_unique_keys(tmp_path): + model = {"name": "m", "datasets": [{"name": "items", "source": "db.s.items", + "primary_key": ["id"], + "unique_keys": [["sku"], ["id", "variant"]], + "fields": []}]} + sm = _osi_roundtrip(model, tmp_path) + assert sm["datasets"][0]["unique_keys"] == [["sku"], ["id", "variant"]] + + +def test_osi_roundtrip_field_label(tmp_path): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "status", "label": "sales", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, + "dimension": {"is_time": False}}]}]} + sm = _osi_roundtrip(model, tmp_path) + f = next(f for f in sm["datasets"][0]["fields"] if f["name"] == "status") + assert f["label"] == "sales" + + +def test_osi_roundtrip_ai_context_string(tmp_path): + ai_ctx_value = "order status, order state" + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "status", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, + "ai_context": ai_ctx_value, + "dimension": {"is_time": False}}]}]} + sm = _osi_roundtrip(model, tmp_path) + f = next(f for f in sm["datasets"][0]["fields"] if f["name"] == "status") + # String ai_context is merged into description on OSI→Honeydew; value must be recoverable + assert ai_ctx_value in (f.get("description") or "") or f.get("ai_context") == ai_ctx_value + + +def test_osi_roundtrip_ai_context_dict(tmp_path): + ctx = {"instructions": "Use for revenue analysis", "synonyms": ["revenue", "sales"]} + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "total", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, + "ai_context": ctx}]}]} + sm = 
_osi_roundtrip(model, tmp_path) + f = next(f for f in sm["datasets"][0]["fields"] if f["name"] == "total") + assert f.get("ai_context") == ctx + + +def test_osi_roundtrip_model_ai_context(tmp_path): + ctx = {"instructions": "Retail analytics", "synonyms": ["store"]} + model = {"name": "m", "ai_context": ctx, "datasets": []} + sm = _osi_roundtrip(model, tmp_path) + assert sm.get("ai_context") == ctx + + +def test_osi_roundtrip_non_honeydew_custom_extensions(tmp_path): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "custom_extensions": [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}], + "fields": []}]} + sm = _osi_roundtrip(model, tmp_path) + exts = sm["datasets"][0].get("custom_extensions") or [] + assert any(e["vendor_name"] == "SNOWFLAKE" for e in exts) + + +def test_osi_roundtrip_relationship_name(tmp_path): + model = {"name": "m", "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}]} + sm = _osi_roundtrip(model, tmp_path) + assert sm["relationships"][0]["name"] == "orders_to_customers" + + +def test_osi_roundtrip_relationship_columns(tmp_path): + model = {"name": "m", "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], "relationships": [{"name": "r", "from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}]} + sm = _osi_roundtrip(model, tmp_path) + rel = sm["relationships"][0] + assert rel["from_columns"] == ["cid"] and rel["to_columns"] == ["id"] + + +def test_osi_roundtrip_metric(tmp_path): + model = {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [{"name": "total_revenue", "description": "Sum of sales", + 
"expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}]} + sm = _osi_roundtrip(model, tmp_path) + m = sm["metrics"][0] + assert m["name"] == "total_revenue" + assert m["expression"]["dialects"][0]["expression"] == "SUM(orders.total)" + assert m["description"] == "Sum of sales" + + +def test_osi_roundtrip_tpcds_example(tmp_path): + tpcds_path = ( + Path(__file__).resolve().parent.parent.parent.parent + / "examples" / "tpcds_semantic_model.yaml" + ) + if not tpcds_path.exists(): + pytest.skip("TPC-DS example not found") + osi_yaml = tpcds_path.read_text() + files = convert_osi_to_honeydew(osi_yaml) + for rel_path, content in files.items(): + p = tmp_path / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + sm = result["semantic_model"][0] + assert sm["name"] == "tpcds_retail_model" + ds_names = {ds["name"] for ds in sm["datasets"]} + assert "store_sales" in ds_names and "customer" in ds_names # ───────────────────────────────────────────────────────────────────────────── -# Round-trip tests (idempotency) +# Honeydew → OSI → Honeydew round-trip tests # ───────────────────────────────────────────────────────────────────────────── -class TestOsiToHoneydewToOsiRoundTrip: - """OSI → Honeydew → OSI: verify key fields survive both legs.""" - - def _roundtrip(self, model_dict, tmp_path): - files = convert_osi_to_honeydew(_osi(model_dict)) - for rel_path, content in files.items(): - p = tmp_path / rel_path - p.parent.mkdir(parents=True, exist_ok=True) - p.write_text(content) - return yaml.safe_load(convert_honeydew_to_osi(str(tmp_path)))["semantic_model"][0] - - def test_name_and_description_preserved(self, tmp_path): - model = {"name": "retail", "description": "Retail model", "datasets": []} - sm = self._roundtrip(model, tmp_path) - assert sm["name"] == "retail" - assert sm["description"] == "Retail model" - - def test_primary_key_preserved(self, 
tmp_path): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", - "primary_key": ["order_id"], "fields": []}]} - sm = self._roundtrip(model, tmp_path) - assert sm["datasets"][0]["primary_key"] == ["order_id"] - - def test_composite_primary_key_preserved(self, tmp_path): - model = {"name": "m", "datasets": [{"name": "li", "source": "db.s.li", - "primary_key": ["order_id", "line_no"], "fields": []}]} - sm = self._roundtrip(model, tmp_path) - assert sm["datasets"][0]["primary_key"] == ["order_id", "line_no"] - - def test_unique_keys_preserved(self, tmp_path): - model = {"name": "m", "datasets": [{"name": "items", "source": "db.s.items", - "primary_key": ["id"], - "unique_keys": [["sku"], ["id", "variant"]], - "fields": []}]} - sm = self._roundtrip(model, tmp_path) - assert sm["datasets"][0]["unique_keys"] == [["sku"], ["id", "variant"]] - - def test_field_label_preserved(self, tmp_path): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", - "fields": [{"name": "status", "label": "sales", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, - "dimension": {"is_time": False}}]}]} - sm = self._roundtrip(model, tmp_path) - f = next(f for f in sm["datasets"][0]["fields"] if f["name"] == "status") - assert f["label"] == "sales" - - def test_ai_context_string_preserved(self, tmp_path): - ai_ctx_value = "order status, order state" - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", - "fields": [{"name": "status", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, - "ai_context": ai_ctx_value, - "dimension": {"is_time": False}}]}]} - sm = self._roundtrip(model, tmp_path) - f = next(f for f in sm["datasets"][0]["fields"] if f["name"] == "status") - # String ai_context is merged into description on OSI→Honeydew; value must be recoverable - assert ai_ctx_value in (f.get("description") or "") or f.get("ai_context") == ai_ctx_value - - def 
test_ai_context_dict_preserved(self, tmp_path): - ctx = {"instructions": "Use for revenue analysis", "synonyms": ["revenue", "sales"]} - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", - "fields": [{"name": "total", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, - "ai_context": ctx}]}]} - sm = self._roundtrip(model, tmp_path) - f = next(f for f in sm["datasets"][0]["fields"] if f["name"] == "total") - assert f.get("ai_context") == ctx - - def test_model_ai_context_preserved(self, tmp_path): - ctx = {"instructions": "Retail analytics", "synonyms": ["store"]} - model = {"name": "m", "ai_context": ctx, "datasets": []} - sm = self._roundtrip(model, tmp_path) - assert sm.get("ai_context") == ctx - - def test_non_honeydew_custom_extensions_preserved(self, tmp_path): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", - "custom_extensions": [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}], - "fields": []}]} - sm = self._roundtrip(model, tmp_path) - exts = sm["datasets"][0].get("custom_extensions") or [] - assert any(e["vendor_name"] == "SNOWFLAKE" for e in exts) - - def test_relationship_name_preserved(self, tmp_path): - model = {"name": "m", "datasets": [ - {"name": "orders", "source": "db.s.orders", "fields": []}, - {"name": "customers", "source": "db.s.customers", "fields": []}, - ], "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", - "from_columns": ["cid"], "to_columns": ["id"]}]} - sm = self._roundtrip(model, tmp_path) - assert sm["relationships"][0]["name"] == "orders_to_customers" - - def test_relationship_columns_preserved(self, tmp_path): - model = {"name": "m", "datasets": [ - {"name": "orders", "source": "db.s.orders", "fields": []}, - {"name": "customers", "source": "db.s.customers", "fields": []}, - ], "relationships": [{"name": "r", "from": "orders", "to": "customers", - "from_columns": ["cid"], "to_columns": 
["id"]}]} - sm = self._roundtrip(model, tmp_path) - rel = sm["relationships"][0] - assert rel["from_columns"] == ["cid"] and rel["to_columns"] == ["id"] - - def test_metric_name_and_expression_preserved(self, tmp_path): - model = {"name": "m", - "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], - "metrics": [{"name": "total_revenue", "description": "Sum of sales", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}]} - sm = self._roundtrip(model, tmp_path) - m = sm["metrics"][0] - assert m["name"] == "total_revenue" - assert m["expression"]["dialects"][0]["expression"] == "SUM(orders.total)" - assert m["description"] == "Sum of sales" - - def test_tpcds_example_roundtrip(self, tmp_path): - tpcds_path = ( - Path(__file__).resolve().parent.parent.parent.parent - / "examples" / "tpcds_semantic_model.yaml" - ) - if not tpcds_path.exists(): - pytest.skip("TPC-DS example not found") - osi_yaml = tpcds_path.read_text() - files = convert_osi_to_honeydew(osi_yaml) - for rel_path, content in files.items(): - p = tmp_path / rel_path - p.parent.mkdir(parents=True, exist_ok=True) - p.write_text(content) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - sm = result["semantic_model"][0] - assert sm["name"] == "tpcds_retail_model" - ds_names = {ds["name"] for ds in sm["datasets"]} - assert "store_sales" in ds_names and "customer" in ds_names - - -class TestHoneydewToOsiToHoneydewRoundTrip: - """Honeydew → OSI → Honeydew: verify Honeydew-specific fields survive.""" - - def _roundtrip(self, entities, tmp_path): - _write_workspace(str(tmp_path), "ws", entities) - osi_yaml = convert_honeydew_to_osi(str(tmp_path)) - files = convert_osi_to_honeydew(osi_yaml) - # Write to a second directory - out_dir = tmp_path / "out" - for rel_path, content in files.items(): - p = out_dir / rel_path - p.parent.mkdir(parents=True, exist_ok=True) - p.write_text(content) - return out_dir - - def 
test_entity_name_and_keys_preserved(self, tmp_path): - out_dir = self._roundtrip([{ - "name": "orders", "keys": ["order_id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", "dataset_attrs": [], - }], tmp_path) - entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) - assert entity["name"] == "orders" - assert entity["keys"] == ["order_id"] - - def test_source_preserved(self, tmp_path): - out_dir = self._roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.SCHEMA.ORDERS", "dataset_attrs": [], - }], tmp_path) - ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) - assert ds["sql"] == "DB.SCHEMA.ORDERS" - - def test_column_attributes_preserved(self, tmp_path): - out_dir = self._roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", - "dataset_attrs": [ - {"column": "o_id", "name": "id", "datatype": "number"}, - {"column": "o_status", "name": "status", "datatype": "string"}, - ], - }], tmp_path) - ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) - attrs = {a["name"]: a for a in ds["attributes"]} - assert attrs["id"]["column"] == "o_id" - assert attrs["status"]["datatype"] == "string" - - def test_labels_preserved_on_column(self, tmp_path): - out_dir = self._roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", - "dataset_attrs": [ - {"column": "status", "name": "status", "datatype": "string", "labels": ["sales"]}, - ], - }], tmp_path) - ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) - attrs = {a["name"]: a for a in ds["attributes"]} - assert "sales" in attrs["status"].get("labels", []) - - def test_calculated_attribute_sql_preserved(self, tmp_path): - out_dir = self._roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", "dataset_attrs": [], - "calc_attrs": [{"type": "calculated_attribute", 
"entity": "orders", - "name": "disc", "datatype": "number", - "sql": "orders.price * (1 - orders.discount)"}], - }], tmp_path) - calc = yaml.safe_load((out_dir / "schema/orders/attributes/disc.yml").read_text()) - assert calc["sql"] == "orders.price * (1 - orders.discount)" - - def test_metric_entity_assignment_preserved(self, tmp_path): - out_dir = self._roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", "dataset_attrs": [], - "metrics": [{"type": "metric", "entity": "orders", "name": "cnt", - "datatype": "number", "sql": "COUNT(*)"}], - }], tmp_path) - m = yaml.safe_load((out_dir / "schema/orders/metrics/cnt.yml").read_text()) - assert m["entity"] == "orders" - assert m["sql"] == "COUNT(*)" - - def test_relation_preserved(self, tmp_path): - out_dir = self._roundtrip([ - {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", - "relations": [{"target_entity": "customers", "rel_type": "many-to-one", - "connection": [{"src_field": "cid", "target_field": "id"}]}], - "dataset_attrs": []}, - {"name": "customers", "keys": ["id"], "key_dataset": "customers", - "sql": "DB.S.CUSTOMERS", "dataset_attrs": []}, - ], tmp_path) - entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) - assert entity["relations"][0]["target_entity"] == "customers" - assert entity["relations"][0]["connection"][0]["src_field"] == "cid" - - def test_bool_datatype_preserved(self, tmp_path): - out_dir = self._roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", - "dataset_attrs": [ - {"column": "is_active", "name": "is_active", "datatype": "bool"}, - ], - }], tmp_path) - ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) - attrs = {a["name"]: a for a in ds["attributes"]} - assert attrs["is_active"]["datatype"] == "bool" - - def test_connection_expr_preserved(self, tmp_path): - out_dir = self._roundtrip([ - {"name": "orders", "keys": 
["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", - "relations": [{"target_entity": "customers", "rel_type": "many-to-one", - "connection_expr": {"sql": "orders.cid = customers.id AND orders.region = customers.region"}}], - "dataset_attrs": []}, - {"name": "customers", "keys": ["id"], "key_dataset": "customers", - "sql": "DB.S.CUSTOMERS", "dataset_attrs": []}, - ], tmp_path) - entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) - rel = entity["relations"][0] - assert rel.get("connection_expr", {}).get("sql") == "orders.cid = customers.id AND orders.region = customers.region" - - def test_dataset_attr_display_name_and_format_preserved(self, tmp_path): - out_dir = self._roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", - "dataset_attrs": [ - {"column": "status", "name": "status", "datatype": "string", - "display_name": "Order Status", "hidden": True, "format_string": "##,###"}, - ], - }], tmp_path) - ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) - attrs = {a["name"]: a for a in ds["attributes"]} - assert attrs["status"]["display_name"] == "Order Status" - assert attrs["status"]["hidden"] is True - assert attrs["status"]["format_string"] == "##,###" - - def test_calc_attr_honeydew_fields_preserved(self, tmp_path): - out_dir = self._roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", "dataset_attrs": [], - "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", - "name": "disc", "datatype": "number", - "sql": "orders.price * 0.9", - "display_name": "Discounted Price", - "timegrain": "day"}], - }], tmp_path) - calc = yaml.safe_load((out_dir / "schema/orders/attributes/disc.yml").read_text()) - assert calc["display_name"] == "Discounted Price" - assert calc["timegrain"] == "day" - - def test_entity_owner_and_display_name_preserved(self, tmp_path): - ws_path = tmp_path / "workspace.yml" - 
ws_path.write_text(yaml.dump({"type": "workspace", "name": "ws"})) - base = tmp_path / "schema" / "orders" - (base / "datasets").mkdir(parents=True) - (base / "orders.yml").write_text(yaml.dump({ - "type": "entity", "name": "orders", "keys": ["id"], - "key_dataset": "orders", "relations": [], - "owner": "analytics_team", "display_name": "Orders Table", - })) - (base / "datasets" / "orders.yml").write_text(yaml.dump({ - "type": "dataset", "entity": "orders", "name": "orders", - "sql": "DB.S.ORDERS", "dataset_type": "table", "attributes": [], - })) - osi_yaml = convert_honeydew_to_osi(str(tmp_path)) - files = convert_osi_to_honeydew(osi_yaml) - out_dir = tmp_path / "out" - for rel_path, content in files.items(): - p = out_dir / rel_path - p.parent.mkdir(parents=True, exist_ok=True) - p.write_text(content) - entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) - assert entity.get("owner") == "analytics_team" - assert entity.get("display_name") == "Orders Table" - - def test_calc_attr_with_simple_identifier_sql_preserved(self, tmp_path): - out_dir = self._roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", "dataset_attrs": [], - "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", - "name": "revenue", "datatype": "number", "sql": "revenue"}], - }], tmp_path) - # sql='revenue' is a simple identifier — must still come back as calculated_attribute - calc_path = out_dir / "schema/orders/attributes/revenue.yml" - assert calc_path.exists(), "calculated_attribute with simple-id sql should not become a dataset column" - calc = yaml.safe_load(calc_path.read_text()) - assert calc["sql"] == "revenue" +def test_honeydew_roundtrip_entity_name_and_keys(tmp_path): + out_dir = _honeydew_roundtrip([{ + "name": "orders", "keys": ["order_id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": [], + }], tmp_path) + entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) 
+ assert entity["name"] == "orders" and entity["keys"] == ["order_id"] + + +def test_honeydew_roundtrip_source(tmp_path): + out_dir = _honeydew_roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.SCHEMA.ORDERS", "dataset_attrs": [], + }], tmp_path) + ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) + assert ds["sql"] == "DB.SCHEMA.ORDERS" + + +def test_honeydew_roundtrip_column_attributes(tmp_path): + out_dir = _honeydew_roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", + "dataset_attrs": [ + {"column": "o_id", "name": "id", "datatype": "number"}, + {"column": "o_status", "name": "status", "datatype": "string"}, + ], + }], tmp_path) + ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) + attrs = {a["name"]: a for a in ds["attributes"]} + assert attrs["id"]["column"] == "o_id" + assert attrs["status"]["datatype"] == "string" + + +def test_honeydew_roundtrip_labels_on_column(tmp_path): + out_dir = _honeydew_roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", + "dataset_attrs": [{"column": "status", "name": "status", "datatype": "string", "labels": ["sales"]}], + }], tmp_path) + ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) + attrs = {a["name"]: a for a in ds["attributes"]} + assert "sales" in attrs["status"].get("labels", []) + + +def test_honeydew_roundtrip_calculated_attribute_sql(tmp_path): + out_dir = _honeydew_roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "disc", "datatype": "number", + "sql": "orders.price * (1 - orders.discount)"}], + }], tmp_path) + calc = yaml.safe_load((out_dir / "schema/orders/attributes/disc.yml").read_text()) + assert calc["sql"] == "orders.price * (1 - 
orders.discount)" + + +def test_honeydew_roundtrip_metric_entity_assignment(tmp_path): + out_dir = _honeydew_roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": [], + "metrics": [{"type": "metric", "entity": "orders", "name": "cnt", + "datatype": "number", "sql": "COUNT(*)"}], + }], tmp_path) + m = yaml.safe_load((out_dir / "schema/orders/metrics/cnt.yml").read_text()) + assert m["entity"] == "orders" and m["sql"] == "COUNT(*)" + + +def test_honeydew_roundtrip_relation(tmp_path): + out_dir = _honeydew_roundtrip([ + {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "connection": [{"src_field": "cid", "target_field": "id"}]}], + "dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", + "sql": "DB.S.CUSTOMERS", "dataset_attrs": []}, + ], tmp_path) + entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) + assert entity["relations"][0]["target_entity"] == "customers" + assert entity["relations"][0]["connection"][0]["src_field"] == "cid" + + +def test_honeydew_roundtrip_bool_datatype(tmp_path): + out_dir = _honeydew_roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", + "dataset_attrs": [{"column": "is_active", "name": "is_active", "datatype": "bool"}], + }], tmp_path) + ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) + attrs = {a["name"]: a for a in ds["attributes"]} + assert attrs["is_active"]["datatype"] == "bool" + + +def test_honeydew_roundtrip_connection_expr(tmp_path): + out_dir = _honeydew_roundtrip([ + {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "connection_expr": {"sql": "orders.cid = customers.id AND orders.region = customers.region"}}], + 
"dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", + "sql": "DB.S.CUSTOMERS", "dataset_attrs": []}, + ], tmp_path) + entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) + rel = entity["relations"][0] + assert rel.get("connection_expr", {}).get("sql") == "orders.cid = customers.id AND orders.region = customers.region" + + +@pytest.mark.parametrize("attr_extra,check_key,check_val", [ + ({"display_name": "Order Status"}, "display_name", "Order Status"), + ({"hidden": True}, "hidden", True), + ({"format_string": "##,###"}, "format_string", "##,###"), +]) +def test_honeydew_roundtrip_dataset_attr_honeydew_field(tmp_path, attr_extra, check_key, check_val): + attr = {"column": "status", "name": "status", "datatype": "string", **attr_extra} + out_dir = _honeydew_roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": [attr], + }], tmp_path) + ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) + attrs = {a["name"]: a for a in ds["attributes"]} + assert attrs["status"][check_key] == check_val + + +@pytest.mark.parametrize("calc_extra,check_key,check_val", [ + ({"display_name": "Discounted Price"}, "display_name", "Discounted Price"), + ({"timegrain": "day"}, "timegrain", "day"), +]) +def test_honeydew_roundtrip_calc_attr_honeydew_field(tmp_path, calc_extra, check_key, check_val): + calc = {"type": "calculated_attribute", "entity": "orders", + "name": "disc", "datatype": "number", "sql": "orders.price * 0.9", **calc_extra} + out_dir = _honeydew_roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": [], "calc_attrs": [calc], + }], tmp_path) + result = yaml.safe_load((out_dir / "schema/orders/attributes/disc.yml").read_text()) + assert result[check_key] == check_val + + +@pytest.mark.parametrize("entity_extra,check_key,check_val", [ + ({"owner": "analytics_team"}, 
"owner", "analytics_team"), + ({"display_name": "Orders Table"}, "display_name", "Orders Table"), + ({"hidden": True}, "hidden", True), + ({"folder": "finance"}, "folder", "finance"), +]) +def test_honeydew_roundtrip_entity_honeydew_field(tmp_path, entity_extra, check_key, check_val): + out_dir = _honeydew_roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": [], **entity_extra, + }], tmp_path) + entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) + assert entity.get(check_key) == check_val + + +def test_honeydew_roundtrip_calc_attr_simple_identifier_stays_calc(tmp_path): + out_dir = _honeydew_roundtrip([{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "revenue", "datatype": "number", "sql": "revenue"}], + }], tmp_path) + calc_path = out_dir / "schema/orders/attributes/revenue.yml" + assert calc_path.exists(), "calculated_attribute with simple-id sql should not become a dataset column" + calc = yaml.safe_load(calc_path.read_text()) + assert calc["sql"] == "revenue" # ───────────────────────────────────────────────────────────────────────────── # Bug-fix regression tests # ───────────────────────────────────────────────────────────────────────────── -class TestBugFixes: - def test_empty_string_expression_skipped(self): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ - "name": "bad", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": ""}]}, - "dimension": {"is_time": False}, - }]}]} +@pytest.mark.parametrize("expression", [ + {"dialects": [{"dialect": "ANSI_SQL", "expression": ""}]}, + {"dialects": [{"dialect": "ANSI_SQL", "expression": " "}]}, +]) +def test_empty_or_whitespace_field_expression_skipped(expression): + model = {"name": "m", "datasets": [{"name": "orders", "source": 
"db.s.orders", "fields": [{ + "name": "bad", + "expression": expression, + "dimension": {"is_time": False}, + }]}]} + files = convert_osi_to_honeydew(_osi(model)) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + assert all(a["name"] != "bad" for a in ds["attributes"]) + assert "schema/orders/attributes/bad.yml" not in files + + +@pytest.mark.parametrize("expression", [ + {"dialects": [{"dialect": "ANSI_SQL", "expression": ""}]}, + {"dialects": [{"dialect": "ANSI_SQL", "expression": " "}]}, +]) +def test_empty_or_whitespace_metric_expression_skipped(expression): + model = {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [{"name": "bad_m", "expression": expression}]} + files = convert_osi_to_honeydew(_osi(model)) + assert "schema/orders/metrics/bad_m.yml" not in files + + +def test_non_dict_expression_warns(): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "bad", + "expression": "just_a_string", + "dimension": {"is_time": False}, + }]}]} + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") files = convert_osi_to_honeydew(_osi(model)) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - names = [a["name"] for a in ds["attributes"]] - assert "bad" not in names - assert "schema/orders/attributes/bad.yml" not in files - - def test_duplicate_metric_name_warns(self): - model = {"name": "m", - "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], - "metrics": [ - {"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.a)"}]}}, - {"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.b)"}]}}, - ]} - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - files = convert_osi_to_honeydew(_osi(model)) - assert any("total" in str(x.message) for x in w) - # Last definition wins - m 
= yaml.safe_load(files["schema/orders/metrics/total.yml"]) - assert "orders.b" in m["sql"] - - def test_metric_string_ai_context_preserved_in_roundtrip(self, tmp_path): - model = {"name": "m", - "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], - "metrics": [{"name": "rev", "ai_context": "Use for revenue analysis", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}]} - files = convert_osi_to_honeydew(_osi(model)) - for rel_path, content in files.items(): - p = tmp_path / rel_path - p.parent.mkdir(parents=True, exist_ok=True) - p.write_text(content) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - m = result["semantic_model"][0]["metrics"][0] - assert m.get("ai_context") == "Use for revenue analysis" - - def test_whitespace_expression_skipped(self): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ - "name": "bad", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": " "}]}, - "dimension": {"is_time": False}, - }]}]} + assert any("must be a mapping" in str(x.message) for x in w) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + assert all(a["name"] != "bad" for a in ds["attributes"]) + + +def test_duplicate_metric_name_warns(): + model = {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [ + {"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.a)"}]}}, + {"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.b)"}]}}, + ]} + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") files = convert_osi_to_honeydew(_osi(model)) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - names = [a["name"] for a in ds["attributes"]] - assert "bad" not in names - assert "schema/orders/attributes/bad.yml" not in files - - def 
test_non_dict_expression_warns(self): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ - "name": "bad", - "expression": "just_a_string", - "dimension": {"is_time": False}, - }]}]} - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - files = convert_osi_to_honeydew(_osi(model)) - assert any("must be a mapping" in str(x.message) for x in w) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - assert all(a["name"] != "bad" for a in ds["attributes"]) - - def test_malformed_osi_metadata_json_warns(self, tmp_path): - ws_path = tmp_path / "workspace.yml" - ws_path.write_text(yaml.dump({"type": "workspace", "name": "ws"})) - base = tmp_path / "schema" / "orders" - (base / "datasets").mkdir(parents=True) - entity = { - "type": "entity", "name": "orders", "keys": ["id"], "key_dataset": "orders", - "relations": [], - "metadata": [{"name": "osi", "metadata": [ - {"name": "unique_keys", "value": "[broken json"}, - ]}], - } - (base / "orders.yml").write_text(yaml.dump(entity)) - (base / "datasets" / "orders.yml").write_text(yaml.dump( - {"type": "dataset", "entity": "orders", "name": "orders", - "sql": "DB.S.ORDERS", "dataset_type": "table", "attributes": []} - )) - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - convert_honeydew_to_osi(str(tmp_path)) - assert any("unique_keys" in str(x.message) for x in w) + assert any("total" in str(x.message) for x in w) + m = yaml.safe_load(files["schema/orders/metrics/total.yml"]) + assert "orders.b" in m["sql"] + + +def test_metric_string_ai_context_preserved_in_roundtrip(tmp_path): + model = {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [{"name": "rev", "ai_context": "Use for revenue analysis", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}]} + files = convert_osi_to_honeydew(_osi(model)) + for rel_path, content 
in files.items(): + p = tmp_path / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + m = result["semantic_model"][0]["metrics"][0] + assert m.get("ai_context") == "Use for revenue analysis" + + +def test_malformed_osi_metadata_json_warns(tmp_path): + ws_path = tmp_path / "workspace.yml" + ws_path.write_text(yaml.dump({"type": "workspace", "name": "ws"})) + base = tmp_path / "schema" / "orders" + (base / "datasets").mkdir(parents=True) + entity = { + "type": "entity", "name": "orders", "keys": ["id"], "key_dataset": "orders", + "relations": [], + "metadata": [{"name": "osi", "metadata": [ + {"name": "unique_keys", "value": "[broken json"}, + ]}], + } + (base / "orders.yml").write_text(yaml.dump(entity)) + (base / "datasets" / "orders.yml").write_text(yaml.dump( + {"type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", "attributes": []} + )) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + convert_honeydew_to_osi(str(tmp_path)) + assert any("unique_keys" in str(x.message) for x in w) From 35b1f1b83d3714a75a75b3033572afebed7d97f5 Mon Sep 17 00:00:00 2001 From: Baruch Oxman Date: Wed, 27 May 2026 18:34:19 +0300 Subject: [PATCH 06/13] Five code quality improvements to the Honeydew converter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove requirements.txt (superseded by pyproject.toml) - Extract _fields_to_honeydew() from _dataset_to_files(): field classification is now a named, independently testable function; four direct unit tests added - Warn when a relationship has neither from_columns nor connection_expr so callers discover the incomplete join before it reaches Honeydew - Round-trip the vendors list: non-HONEYDEW vendors are stored in the workspace osi metadata on OSI→Honeydew and merged back on the return trip (HONEYDEW always 
appears first) - Add main() CLI smoke tests via subprocess: osi-to-honeydew writes the expected workspace.yml, honeydew-to-osi writes a parseable OSI YAML, and path traversal in an entity name is rejected with exit code 1 - Update README setup instructions to use pip install . / pip install -e . Co-Authored-By: Claude Sonnet 4.6 --- converters/honeydew/README.md | 4 +- converters/honeydew/requirements.txt | 2 - .../honeydew/src/honeydew_osi_converter.py | 139 ++++++++++------- .../tests/test_honeydew_osi_converter.py | 147 ++++++++++++++++++ 4 files changed, 232 insertions(+), 60 deletions(-) delete mode 100644 converters/honeydew/requirements.txt diff --git a/converters/honeydew/README.md b/converters/honeydew/README.md index 103d19a..62e1e8b 100644 --- a/converters/honeydew/README.md +++ b/converters/honeydew/README.md @@ -43,7 +43,9 @@ Bidirectional converter between [OSI](../../core-spec/spec.md) semantic models a ## Setup ```bash -pip install -r requirements.txt +pip install . +# or in editable mode for development: +pip install -e . 
``` ## Usage diff --git a/converters/honeydew/requirements.txt b/converters/honeydew/requirements.txt deleted file mode 100644 index 2f29b5b..0000000 --- a/converters/honeydew/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -PyYAML>=5.0 -pytest>=7.0 diff --git a/converters/honeydew/src/honeydew_osi_converter.py b/converters/honeydew/src/honeydew_osi_converter.py index e082360..3857bb7 100644 --- a/converters/honeydew/src/honeydew_osi_converter.py +++ b/converters/honeydew/src/honeydew_osi_converter.py @@ -85,10 +85,11 @@ def convert_osi_to_honeydew(osi_yaml_str: str) -> dict[str, str]: "only the first will be converted" ) - return _model_to_files(semantic_models[0]) + vendors = [v for v in (root.get("vendors") or []) if v != HONEYDEW_VENDOR] + return _model_to_files(semantic_models[0], extra_vendors=vendors) -def _model_to_files(sm: dict[str, Any]) -> dict[str, str]: +def _model_to_files(sm: dict[str, Any], *, extra_vendors: list[str] | None = None) -> dict[str, str]: name = sm.get("name") if not name: raise HoneydewConversionError("Missing 'name' in semantic model") @@ -99,10 +100,14 @@ def _model_to_files(sm: dict[str, Any]) -> dict[str, str]: if sm.get("description"): workspace["description"] = sm["description"] - # Preserve model-level ai_context and non-HONEYDEW custom_extensions + # Preserve model-level ai_context, non-HONEYDEW custom_extensions, and extra vendors model_ai_ctx = sm.get("ai_context") model_ext = [e for e in (sm.get("custom_extensions") or []) if e.get("vendor_name") != HONEYDEW_VENDOR] - ws_meta = _build_osi_metadata(ai_context=model_ai_ctx, custom_extensions=model_ext or None) + ws_meta = _build_osi_metadata( + ai_context=model_ai_ctx, + custom_extensions=model_ext or None, + extra_vendors=extra_vendors or None, + ) if ws_meta: workspace["metadata"] = [ws_meta] @@ -139,57 +144,11 @@ def _model_to_files(sm: dict[str, Any]) -> dict[str, str]: return files -def _dataset_to_files( - ds: dict[str, Any], - relations: list[dict[str, Any]], - metrics: 
list[dict[str, Any]], -) -> dict[str, str]: - entity_name = ds["name"] - base = f"schema/{entity_name}" - files: dict[str, str] = {} - - primary_key = ds.get("primary_key") or [] - unique_keys = ds.get("unique_keys") - description = ds.get("description") - ai_context = ds.get("ai_context") - fields = ds.get("fields") or [] - ds_ext = [e for e in (ds.get("custom_extensions") or []) if e.get("vendor_name") != HONEYDEW_VENDOR] - - # ── entity YAML ──────────────────────────────────────────────────────────── - entity_dict: dict[str, Any] = {"type": "entity", "name": entity_name} - if description: - entity_dict["description"] = description - if primary_key: - entity_dict["keys"] = list(primary_key) - entity_dict["key_dataset"] = entity_name - - # Restore Honeydew-specific entity fields from HONEYDEW custom_extension - entity_hd_hint = _get_honeydew_extension(ds) - for key in ("owner", "display_name", "hidden", "folder"): - if key in entity_hd_hint: - entity_dict[key] = entity_hd_hint[key] - if "labels" in entity_hd_hint: - entity_dict["labels"] = entity_hd_hint["labels"] - - honeydew_relations = [] - for rel in relations: - hr = _osi_relation_to_honeydew(rel) - if hr is not None: - honeydew_relations.append(hr) - entity_dict["relations"] = honeydew_relations - - # Preserve OSI fields that have no Honeydew native equivalent - entity_meta = _build_osi_metadata( - ai_context=ai_context, - unique_keys=unique_keys, - custom_extensions=ds_ext or None, - ) - if entity_meta: - entity_dict["metadata"] = [entity_meta] - - files[f"{base}/{entity_name}.yml"] = _dump(entity_dict) - - # ── classify fields into dataset attributes vs calculated attributes ──────── +def _fields_to_honeydew( + fields: list[dict[str, Any]], + entity_name: str, +) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Classify OSI fields into Honeydew dataset attributes and calculated attributes.""" dataset_attrs: list[dict[str, Any]] = [] calc_attrs: list[dict[str, Any]] = [] @@ -264,6 +223,62 @@ def 
_dataset_to_files( calc[k] = hd_hint[k] calc_attrs.append(calc) + return dataset_attrs, calc_attrs + + +def _dataset_to_files( + ds: dict[str, Any], + relations: list[dict[str, Any]], + metrics: list[dict[str, Any]], +) -> dict[str, str]: + entity_name = ds["name"] + base = f"schema/{entity_name}" + files: dict[str, str] = {} + + primary_key = ds.get("primary_key") or [] + unique_keys = ds.get("unique_keys") + description = ds.get("description") + ai_context = ds.get("ai_context") + fields = ds.get("fields") or [] + ds_ext = [e for e in (ds.get("custom_extensions") or []) if e.get("vendor_name") != HONEYDEW_VENDOR] + + # ── entity YAML ──────────────────────────────────────────────────────────── + entity_dict: dict[str, Any] = {"type": "entity", "name": entity_name} + if description: + entity_dict["description"] = description + if primary_key: + entity_dict["keys"] = list(primary_key) + entity_dict["key_dataset"] = entity_name + + # Restore Honeydew-specific entity fields from HONEYDEW custom_extension + entity_hd_hint = _get_honeydew_extension(ds) + for key in ("owner", "display_name", "hidden", "folder"): + if key in entity_hd_hint: + entity_dict[key] = entity_hd_hint[key] + if "labels" in entity_hd_hint: + entity_dict["labels"] = entity_hd_hint["labels"] + + honeydew_relations = [] + for rel in relations: + hr = _osi_relation_to_honeydew(rel) + if hr is not None: + honeydew_relations.append(hr) + entity_dict["relations"] = honeydew_relations + + # Preserve OSI fields that have no Honeydew native equivalent + entity_meta = _build_osi_metadata( + ai_context=ai_context, + unique_keys=unique_keys, + custom_extensions=ds_ext or None, + ) + if entity_meta: + entity_dict["metadata"] = [entity_meta] + + files[f"{base}/{entity_name}.yml"] = _dump(entity_dict) + + # ── classify fields into dataset attributes vs calculated attributes ──────── + dataset_attrs, calc_attrs = _fields_to_honeydew(fields, entity_name) + # ── dataset YAML 
─────────────────────────────────────────────────────────── source_sql, dataset_type = _parse_osi_source(ds.get("source", "")) dataset_dict: dict[str, Any] = { @@ -356,6 +371,11 @@ def _osi_relation_to_honeydew(rel: dict[str, Any]) -> dict[str, Any] | None: hd_ext = _get_honeydew_extension(rel) if hd_ext.get("connection_expr"): honeydew_rel["connection_expr"] = {"sql": hd_ext["connection_expr"]} + else: + warnings.warn( + f"Relationship '{rel_name}' has no from_columns and no connection_expr; " + "Honeydew will not be able to resolve the join" + ) return honeydew_rel @@ -556,9 +576,11 @@ def convert_honeydew_to_osi(workspace_dir: str) -> str: if osi_metrics: sm["metrics"] = osi_metrics + extra_vendors = ws_osi_meta.get("vendors") or [] + vendors = [HONEYDEW_VENDOR] + [v for v in extra_vendors if v != HONEYDEW_VENDOR] root: dict[str, Any] = { "version": SUPPORTED_OSI_VERSION, - "vendors": [HONEYDEW_VENDOR], + "vendors": vendors, "semantic_model": [sm], } return _dump(root) @@ -860,6 +882,7 @@ def _build_osi_metadata( ai_context: Any = None, unique_keys: Any = None, custom_extensions: list | None = None, + extra_vendors: list[str] | None = None, ) -> dict[str, Any] | None: """Build a Honeydew metadata entry that stores OSI-only fields for round-tripping.""" items: list[dict[str, Any]] = [] @@ -871,6 +894,8 @@ def _build_osi_metadata( items.append({"name": "unique_keys", "value": json.dumps(unique_keys)}) if custom_extensions: items.append({"name": "custom_extensions", "value": json.dumps(custom_extensions)}) + if extra_vendors: + items.append({"name": "vendors", "value": json.dumps(extra_vendors)}) if not items: return None @@ -891,7 +916,7 @@ def _read_osi_metadata(obj: dict[str, Any]) -> dict[str, Any]: result[key] = json.loads(raw) except (json.JSONDecodeError, TypeError): result[key] = raw - elif key in ("unique_keys", "custom_extensions"): + elif key in ("unique_keys", "custom_extensions", "vendors"): try: result[key] = json.loads(raw) except 
(json.JSONDecodeError, TypeError): diff --git a/converters/honeydew/tests/test_honeydew_osi_converter.py b/converters/honeydew/tests/test_honeydew_osi_converter.py index 810661e..f46fb2f 100644 --- a/converters/honeydew/tests/test_honeydew_osi_converter.py +++ b/converters/honeydew/tests/test_honeydew_osi_converter.py @@ -15,6 +15,7 @@ _assign_metrics_to_entities, _build_osi_metadata, _check_safe_path, + _fields_to_honeydew, _find_entity_in_expression, _honeydew_datatype_to_osi_dimension, _is_simple_identifier, @@ -1106,3 +1107,149 @@ def test_malformed_osi_metadata_json_warns(tmp_path): warnings.simplefilter("always") convert_honeydew_to_osi(str(tmp_path)) assert any("unique_keys" in str(x.message) for x in w) + + +# ───────────────────────────────────────────────────────────────────────────── +# _fields_to_honeydew unit tests +# ───────────────────────────────────────────────────────────────────────────── + +def test_fields_to_honeydew_simple_identifier_goes_to_dataset(): + fields = [{"name": "status", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, + "dimension": {"is_time": False}}] + dataset_attrs, calc_attrs = _fields_to_honeydew(fields, "orders") + assert len(dataset_attrs) == 1 and len(calc_attrs) == 0 + assert dataset_attrs[0]["column"] == "status" + + +def test_fields_to_honeydew_complex_sql_goes_to_calc(): + fields = [{"name": "disc", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "price * 0.9"}]}}] + dataset_attrs, calc_attrs = _fields_to_honeydew(fields, "orders") + assert len(dataset_attrs) == 0 and len(calc_attrs) == 1 + assert calc_attrs[0]["sql"] == "price * 0.9" + + +def test_fields_to_honeydew_missing_name_raises(): + fields = [{"expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "col"}]}}] + with pytest.raises(HoneydewConversionError, match="missing 'name'"): + _fields_to_honeydew(fields, "orders") + + +def test_fields_to_honeydew_empty_expression_skipped(): + fields = [{"name": 
"bad", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": ""}]}}] + dataset_attrs, calc_attrs = _fields_to_honeydew(fields, "orders") + assert dataset_attrs == [] and calc_attrs == [] + + +# ───────────────────────────────────────────────────────────────────────────── +# Connectionless relation warning +# ───────────────────────────────────────────────────────────────────────────── + +def test_connectionless_relation_warns(): + model = {"name": "m", "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], "relationships": [{"name": "r", "from": "orders", "to": "customers"}]} + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + files = convert_osi_to_honeydew(_osi(model)) + assert any("resolve the join" in str(x.message) for x in w) + entity = yaml.safe_load(files["schema/orders/orders.yml"]) + assert entity["relations"][0]["target_entity"] == "customers" + + +# ───────────────────────────────────────────────────────────────────────────── +# vendors round-trip +# ───────────────────────────────────────────────────────────────────────────── + +def test_vendors_roundtrip_preserves_non_honeydew(tmp_path): + doc = yaml.dump({ + "version": OSI_VERSION, + "vendors": ["SNOWFLAKE", "HONEYDEW"], + "semantic_model": [{"name": "m", "datasets": []}], + }) + files = convert_osi_to_honeydew(doc) + for rel_path, content in files.items(): + p = tmp_path / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert "SNOWFLAKE" in result["vendors"] + assert "HONEYDEW" in result["vendors"] + + +def test_vendors_always_includes_honeydew(tmp_path): + doc = yaml.dump({ + "version": OSI_VERSION, + "vendors": ["SNOWFLAKE"], + "semantic_model": [{"name": "m", "datasets": []}], + }) + files = convert_osi_to_honeydew(doc) + for rel_path, content in files.items(): + 
p = tmp_path / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert result["vendors"][0] == "HONEYDEW" + + +# ───────────────────────────────────────────────────────────────────────────── +# main() CLI smoke tests +# ───────────────────────────────────────────────────────────────────────────── + +def test_main_osi_to_honeydew(tmp_path): + import subprocess, sys + input_file = tmp_path / "model.yaml" + input_file.write_text(yaml.dump({ + "version": OSI_VERSION, + "semantic_model": [{"name": "m", "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []} + ]}], + })) + output_dir = tmp_path / "out" + result = subprocess.run( + [sys.executable, str(Path(__file__).resolve().parent.parent / "src" / "honeydew_osi_converter.py"), + "osi-to-honeydew", "-i", str(input_file), "-o", str(output_dir)], + capture_output=True, text=True, + ) + assert result.returncode == 0 + assert (output_dir / "workspace.yml").exists() + ws = yaml.safe_load((output_dir / "workspace.yml").read_text()) + assert ws["name"] == "m" + + +def test_main_honeydew_to_osi(tmp_path): + import subprocess, sys + _write_workspace(str(tmp_path), "ws", [{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": [], + }]) + output_file = tmp_path / "output.yaml" + result = subprocess.run( + [sys.executable, str(Path(__file__).resolve().parent.parent / "src" / "honeydew_osi_converter.py"), + "honeydew-to-osi", "-i", str(tmp_path), "-o", str(output_file)], + capture_output=True, text=True, + ) + assert result.returncode == 0 + assert output_file.exists() + doc = yaml.safe_load(output_file.read_text()) + assert doc["semantic_model"][0]["name"] == "ws" + + +def test_main_path_traversal_rejected(tmp_path): + import subprocess, sys + # Entity name containing traversal sequences generates paths that escape output_dir + input_file = tmp_path / "model.yaml" + 
input_file.write_text( + f"version: '{OSI_VERSION}'\nsemantic_model:\n" + " - name: m\n datasets:\n" + " - name: '../../evil'\n source: db.s.evil\n fields: []\n" + ) + output_dir = tmp_path / "out" + result = subprocess.run( + [sys.executable, str(Path(__file__).resolve().parent.parent / "src" / "honeydew_osi_converter.py"), + "osi-to-honeydew", "-i", str(input_file), "-o", str(output_dir)], + capture_output=True, text=True, + ) + assert result.returncode == 1 + assert "refusing to write" in result.stderr + assert not (tmp_path / "evil.yml").exists() From 58495559a6c2407cd1572ad87dcf371a43905055 Mon Sep 17 00:00:00 2001 From: Baruch Oxman Date: Wed, 27 May 2026 18:56:27 +0300 Subject: [PATCH 07/13] Rewrite tests to be fully parametrized with complete output verification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Collapse 13 individual OSI→Honeydew tests, 7 Honeydew→OSI tests, 10 OSI round-trip tests, and 13 Honeydew round-trip tests into four @pytest.mark.parametrize blocks. Every assertion now compares the entire output dict rather than cherry-picked fields. 
Co-Authored-By: Claude Sonnet 4.6 --- .../tests/test_honeydew_osi_converter.py | 1442 ++++++++++------- 1 file changed, 815 insertions(+), 627 deletions(-) diff --git a/converters/honeydew/tests/test_honeydew_osi_converter.py b/converters/honeydew/tests/test_honeydew_osi_converter.py index f46fb2f..46e1e7a 100644 --- a/converters/honeydew/tests/test_honeydew_osi_converter.py +++ b/converters/honeydew/tests/test_honeydew_osi_converter.py @@ -323,183 +323,227 @@ def test_check_safe_path(rel_path, expected): # ───────────────────────────────────────────────────────────────────────────── -# OSI → Honeydew integration tests +# OSI → Honeydew: file content # ───────────────────────────────────────────────────────────────────────────── -def test_osi_to_honeydew_workspace_yml(): - files = convert_osi_to_honeydew(_osi(_minimal_model())) - ws = yaml.safe_load(files["workspace.yml"]) - assert ws["name"] == "test_model" and ws["type"] == "workspace" - - -def test_osi_to_honeydew_entity_yml(): - files = convert_osi_to_honeydew(_osi(_minimal_model())) - entity = yaml.safe_load(files["schema/orders/orders.yml"]) - assert entity["name"] == "orders" - assert entity["keys"] == ["order_id"] - assert entity["key_dataset"] == "orders" - - -def test_osi_to_honeydew_dataset_yml(): - files = convert_osi_to_honeydew(_osi(_minimal_model())) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - assert ds["sql"] == "db.schema.orders" - assert ds["dataset_type"] == "table" - - -def test_osi_to_honeydew_simple_fields_become_dataset_attributes(): - files = convert_osi_to_honeydew(_osi(_minimal_model())) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - names = [a["name"] for a in ds["attributes"]] - assert "order_id" in names and "order_date" in names and "total" in names - - -@pytest.mark.parametrize("field_name,expected_dt", [ - ("order_date", "timestamp"), - ("total", "number"), -]) -def test_osi_to_honeydew_field_datatypes(field_name, expected_dt): - files = 
convert_osi_to_honeydew(_osi(_minimal_model())) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - attrs = {a["name"]: a for a in ds["attributes"]} - assert attrs[field_name]["datatype"] == expected_dt - - -def test_osi_to_honeydew_complex_expression_becomes_calculated_attribute(): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ - "name": "disc_price", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "price * (1 - discount)"}]}, - "dimension": {"is_time": False}, - }]}]} - files = convert_osi_to_honeydew(_osi(model)) - assert "schema/orders/attributes/disc_price.yml" in files - calc = yaml.safe_load(files["schema/orders/attributes/disc_price.yml"]) - assert calc["type"] == "calculated_attribute" - assert calc["sql"] == "price * (1 - discount)" - - -def test_osi_to_honeydew_label_mapped_to_honeydew_labels(): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ - "name": "status", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, - "dimension": {"is_time": False}, - "label": "sales", - }]}]} - files = convert_osi_to_honeydew(_osi(model)) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - attrs = {a["name"]: a for a in ds["attributes"]} - assert "sales" in attrs["status"]["labels"] - - -def test_osi_to_honeydew_ai_context_string_merged_into_description(): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ - "name": "total", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, - "description": "Base desc", - "ai_context": "revenue, earnings", - }]}]} - files = convert_osi_to_honeydew(_osi(model)) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - attrs = {a["name"]: a for a in ds["attributes"]} - assert "revenue, earnings" in attrs["total"]["description"] - - -def 
test_osi_to_honeydew_ai_context_dict_instructions_merged_into_description(): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ - "name": "total", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, - "ai_context": {"instructions": "Use for revenue", "synonyms": ["rev", "earnings"]}, - }]}]} - files = convert_osi_to_honeydew(_osi(model)) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - attrs = {a["name"]: a for a in ds["attributes"]} - assert "Use for revenue" in attrs["total"]["description"] - assert "rev" in attrs["total"]["labels"] - - -def test_osi_to_honeydew_ai_context_dict_stored_in_metadata(): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ - "name": "total", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, - "ai_context": {"instructions": "Use for revenue", "synonyms": ["rev"]}, - }]}]} - files = convert_osi_to_honeydew(_osi(model)) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - attr = next(a for a in ds["attributes"] if a["name"] == "total") - osi_section = next((s for s in attr.get("metadata", []) if s["name"] == "osi"), None) - assert osi_section is not None - assert any(i["name"] == "ai_context" for i in osi_section["metadata"]) - - -def test_osi_to_honeydew_unique_keys_stored_in_entity_metadata(): - model = {"name": "m", "datasets": [{"name": "items", "source": "db.s.items", - "primary_key": ["item_id"], - "unique_keys": [["sku"], ["item_id", "variant"]], - "fields": []}]} - files = convert_osi_to_honeydew(_osi(model)) - entity = yaml.safe_load(files["schema/items/items.yml"]) - osi_section = next((s for s in entity.get("metadata", []) if s["name"] == "osi"), None) - assert osi_section is not None - uk_item = next((i for i in osi_section["metadata"] if i["name"] == "unique_keys"), None) - assert uk_item is not None - assert json.loads(uk_item["value"]) == [["sku"], 
["item_id", "variant"]] - - -def test_osi_to_honeydew_non_honeydew_extensions_stored_in_metadata(): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", - "custom_extensions": [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}], - "fields": []}]} - files = convert_osi_to_honeydew(_osi(model)) - entity = yaml.safe_load(files["schema/orders/orders.yml"]) - osi_section = next((s for s in entity.get("metadata", []) if s["name"] == "osi"), None) - assert osi_section is not None - ext_item = next((i for i in osi_section["metadata"] if i["name"] == "custom_extensions"), None) - assert ext_item is not None - exts = json.loads(ext_item["value"]) - assert any(e["vendor_name"] == "SNOWFLAKE" for e in exts) - - -def test_osi_to_honeydew_relationship_name_stored_in_relation(): - model = {"name": "m", "datasets": [ +_REL_MODEL = { + "name": "m", + "datasets": [ {"name": "orders", "source": "db.s.orders", "fields": []}, {"name": "customers", "source": "db.s.customers", "fields": []}, - ], "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", - "from_columns": ["cid"], "to_columns": ["id"]}]} - files = convert_osi_to_honeydew(_osi(model)) - entity = yaml.safe_load(files["schema/orders/orders.yml"]) - assert entity["relations"][0]["name"] == "orders_to_customers" - - -def test_osi_to_honeydew_model_ai_context_stored_in_workspace_metadata(): - model = {"name": "m", "datasets": [], - "ai_context": {"instructions": "Use for retail analytics", "synonyms": ["store"]}} - files = convert_osi_to_honeydew(_osi(model)) - ws = yaml.safe_load(files["workspace.yml"]) - assert any(s["name"] == "osi" for s in ws.get("metadata", [])) - - -def test_osi_to_honeydew_relationship_on_from_entity_only(): - model = {"name": "m", "datasets": [ - {"name": "orders", "source": "db.s.orders", "fields": []}, - {"name": "customers", "source": "db.s.customers", "fields": []}, - ], "relationships": [{"name": "r", "from": "orders", "to": 
"customers", - "from_columns": ["cid"], "to_columns": ["id"]}]} - files = convert_osi_to_honeydew(_osi(model)) - orders = yaml.safe_load(files["schema/orders/orders.yml"]) - customers = yaml.safe_load(files["schema/customers/customers.yml"]) - assert len(orders["relations"]) == 1 - assert customers["relations"] == [] - rel = orders["relations"][0] - assert rel["target_entity"] == "customers" and rel["rel_type"] == "many-to-one" - assert rel["connection"] == [{"src_field": "cid", "target_field": "id"}] - - -def test_osi_to_honeydew_metric_assigned_by_expression_entity(): - model = {"name": "m", - "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], - "metrics": [{"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}]} + ], + "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}], +} + +@pytest.mark.parametrize("model,path,expected", [ + # ── minimal model ────────────────────────────────────────────────────────── + pytest.param( + _minimal_model(), + "workspace.yml", + {"type": "workspace", "name": "test_model"}, + id="minimal-workspace", + ), + pytest.param( + _minimal_model(), + "schema/orders/orders.yml", + {"type": "entity", "name": "orders", "keys": ["order_id"], + "key_dataset": "orders", "relations": []}, + id="minimal-entity", + ), + pytest.param( + _minimal_model(), + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "db.schema.orders", "dataset_type": "table", + "attributes": [ + {"column": "order_id", "name": "order_id", "datatype": "string"}, + {"column": "order_date", "name": "order_date", "datatype": "timestamp"}, + {"column": "total_amount", "name": "total", "datatype": "number"}, + ], + }, + id="minimal-dataset", + ), + # ── complex expression → calculated attribute ────────────────────────────── + pytest.param( + {"name": "m", "datasets": 
[{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "disc_price", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "price * (1 - discount)"}]}, + "dimension": {"is_time": False}, + }]}]}, + "schema/orders/attributes/disc_price.yml", + {"type": "calculated_attribute", "entity": "orders", "name": "disc_price", + "datatype": "string", "sql": "price * (1 - discount)"}, + id="calc-attr-file", + ), + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "disc_price", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "price * (1 - discount)"}]}, + "dimension": {"is_time": False}, + }]}]}, + "schema/orders/datasets/orders.yml", + {"type": "dataset", "entity": "orders", "name": "orders", + "sql": "db.s.orders", "dataset_type": "table", "attributes": []}, + id="calc-attr-dataset-empty", + ), + # ── label → labels in attr ──────────────────────────────────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "status", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, + "dimension": {"is_time": False}, + "label": "sales", + }]}]}, + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "db.s.orders", "dataset_type": "table", + "attributes": [{"column": "status", "name": "status", + "datatype": "string", "labels": ["sales"]}], + }, + id="label-in-attr", + ), + # ── ai_context string → description merged ──────────────────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "total", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, + "description": "Base desc", + "ai_context": "revenue, earnings", + }]}]}, + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + 
"sql": "db.s.orders", "dataset_type": "table", + "attributes": [{"column": "total", "name": "total", "datatype": "number", + "description": "Base desc\nrevenue, earnings"}], + }, + id="ai-context-string", + ), + # ── ai_context dict → labels + description + metadata ──────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "total", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, + "ai_context": {"instructions": "Use for revenue", "synonyms": ["rev", "earnings"]}, + }]}]}, + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "db.s.orders", "dataset_type": "table", + "attributes": [{ + "column": "total", "name": "total", "datatype": "number", + "description": "Use for revenue", + "labels": ["rev", "earnings"], + "metadata": [{"name": "osi", "metadata": [ + {"name": "ai_context", + "value": '{"instructions": "Use for revenue", "synonyms": ["rev", "earnings"]}'}, + ]}], + }], + }, + id="ai-context-dict", + ), + # ── unique_keys → entity metadata ───────────────────────────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "items", "source": "db.s.items", + "primary_key": ["item_id"], + "unique_keys": [["sku"], ["item_id", "variant"]], + "fields": []}]}, + "schema/items/items.yml", + { + "type": "entity", "name": "items", "keys": ["item_id"], + "key_dataset": "items", "relations": [], + "metadata": [{"name": "osi", "metadata": [ + {"name": "unique_keys", "value": '[["sku"], ["item_id", "variant"]]'}, + ]}], + }, + id="unique-keys", + ), + # ── non-HONEYDEW custom_extensions → entity metadata ────────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "custom_extensions": [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}], + "fields": []}]}, + "schema/orders/orders.yml", + { + "type": "entity", "name": "orders", "key_dataset": 
"orders", "relations": [], + "metadata": [{"name": "osi", "metadata": [ + {"name": "custom_extensions", + "value": '[{"vendor_name": "SNOWFLAKE", "data": "{\\"warehouse\\": \\"WH\\"}"}]'}, + ]}], + }, + id="custom-ext", + ), + # ── relationship on from-entity; nothing on to-entity ──────────────────── + pytest.param( + _REL_MODEL, + "schema/orders/orders.yml", + { + "type": "entity", "name": "orders", "key_dataset": "orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "name": "orders_to_customers", + "connection": [{"src_field": "cid", "target_field": "id"}]}], + }, + id="relation-from-entity", + ), + pytest.param( + _REL_MODEL, + "schema/customers/customers.yml", + {"type": "entity", "name": "customers", "key_dataset": "customers", "relations": []}, + id="relation-to-entity-empty", + ), + # ── model-level ai_context → workspace metadata ─────────────────────────── + pytest.param( + {"name": "m", "datasets": [], + "ai_context": {"instructions": "Use for retail analytics", "synonyms": ["store"]}}, + "workspace.yml", + { + "type": "workspace", "name": "m", + "metadata": [{"name": "osi", "metadata": [ + {"name": "ai_context", + "value": '{"instructions": "Use for retail analytics", "synonyms": ["store"]}'}, + ]}], + }, + id="model-ai-context", + ), + # ── metric ──────────────────────────────────────────────────────────────── + pytest.param( + {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [{"name": "total_rev", "description": "Sum of sales", + "expression": {"dialects": [{"dialect": "ANSI_SQL", + "expression": "SUM(orders.total)"}]}}]}, + "schema/orders/metrics/total_rev.yml", + {"type": "metric", "entity": "orders", "name": "total_rev", + "datatype": "number", "sql": "SUM(orders.total)", "description": "Sum of sales"}, + id="metric", + ), + # ── subquery source → dataset_type sql ─────────────────────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "orders", + 
"source": "SELECT * FROM raw.orders WHERE active = true", "fields": []}]}, + "schema/orders/datasets/orders.yml", + {"type": "dataset", "entity": "orders", "name": "orders", + "sql": "SELECT * FROM raw.orders WHERE active = true", + "dataset_type": "sql", "attributes": []}, + id="subquery-source", + ), + # ── composite primary key ───────────────────────────────────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "li", "source": "db.s.li", + "primary_key": ["order_id", "line_number"], "fields": []}]}, + "schema/li/li.yml", + {"type": "entity", "name": "li", "keys": ["order_id", "line_number"], + "key_dataset": "li", "relations": []}, + id="composite-pk", + ), +]) +def test_osi_to_honeydew_file_content(model, path, expected): files = convert_osi_to_honeydew(_osi(model)) - assert "schema/orders/metrics/total.yml" in files + assert path in files + assert yaml.safe_load(files[path]) == expected def test_osi_to_honeydew_metric_entity_hint_overrides_expression(): @@ -528,22 +572,6 @@ def test_osi_to_honeydew_missing_semantic_model_raises(): convert_osi_to_honeydew(f"version: '{OSI_VERSION}'\n") -def test_osi_to_honeydew_subquery_source_uses_sql_type(): - model = {"name": "m", "datasets": [{"name": "orders", - "source": "SELECT * FROM raw.orders WHERE active = true", "fields": []}]} - files = convert_osi_to_honeydew(_osi(model)) - ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - assert ds["dataset_type"] == "sql" - - -def test_osi_to_honeydew_composite_primary_key(): - model = {"name": "m", "datasets": [{"name": "li", "source": "db.s.li", - "primary_key": ["order_id", "line_number"], "fields": []}]} - files = convert_osi_to_honeydew(_osi(model)) - entity = yaml.safe_load(files["schema/li/li.yml"]) - assert entity["keys"] == ["order_id", "line_number"] - - def test_osi_to_honeydew_multiple_models_warns(): doc = yaml.dump({"version": OSI_VERSION, "semantic_model": [ {"name": "m1", "datasets": []}, @@ -553,82 +581,207 @@ def 
test_osi_to_honeydew_multiple_models_warns(): warnings.simplefilter("always") files = convert_osi_to_honeydew(doc) assert any("only the first" in str(x.message) for x in w) - assert yaml.safe_load(files["workspace.yml"])["name"] == "m1" + assert yaml.safe_load(files["workspace.yml"]) == {"type": "workspace", "name": "m1"} # ───────────────────────────────────────────────────────────────────────────── -# Honeydew → OSI integration tests +# Honeydew → OSI: full document # ───────────────────────────────────────────────────────────────────────────── -def test_honeydew_to_osi_basic(tmp_path): - _write_workspace(str(tmp_path), "tpch", [{ - "name": "orders", "keys": ["orderkey"], "key_dataset": "tpch_orders", - "sql": "SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS", - "dataset_attrs": [ - {"column": "o_orderkey", "name": "orderkey", "datatype": "number"}, - {"column": "o_orderdate", "name": "orderdate", "datatype": "date"}, - ], - }]) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - sm = result["semantic_model"][0] - assert sm["name"] == "tpch" - ds = sm["datasets"][0] - assert ds["source"] == "SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS" - assert ds["primary_key"] == ["orderkey"] +def _hd_root(sm): + return {"version": OSI_VERSION, "vendors": ["HONEYDEW"], "semantic_model": [sm]} + + +def _ansi(expr): + return {"dialects": [{"dialect": "ANSI_SQL", "expression": expr}]} -@pytest.mark.parametrize("col_name,datatype,expected_dim", [ - ("id", "number", None), - ("status", "string", {"is_time": False}), - ("created_at", "timestamp", {"is_time": True}), +@pytest.mark.parametrize("ws_name,entities,expected_root", [ + # ── basic entity with two dataset attributes ────────────────────────────── + pytest.param( + "tpch", + [{"name": "orders", "keys": ["orderkey"], "key_dataset": "tpch_orders", + "sql": "SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS", + "dataset_attrs": [ + {"column": "o_orderkey", "name": "orderkey", "datatype": "number"}, + {"column": "o_orderdate", "name": 
"orderdate", "datatype": "date"}, + ]}], + _hd_root({ + "name": "tpch", + "datasets": [{ + "name": "orders", + "source": "SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS", + "primary_key": ["orderkey"], + "fields": [ + {"name": "orderkey", "expression": _ansi("o_orderkey")}, + {"name": "orderdate", "expression": _ansi("o_orderdate"), + "dimension": {"is_time": True}}, + ], + }], + }), + id="basic", + ), + # ── field types ─────────────────────────────────────────────────────────── + pytest.param( + "ws", + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [{"column": "id", "name": "id", "datatype": "number"}]}], + _hd_root({"name": "ws", "datasets": [{ + "name": "orders", "source": "db.s.orders", "primary_key": ["id"], + "fields": [{"name": "id", "expression": _ansi("id")}], + }]}), + id="field-number", + ), + pytest.param( + "ws", + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [{"column": "status", "name": "status", "datatype": "string"}]}], + _hd_root({"name": "ws", "datasets": [{ + "name": "orders", "source": "db.s.orders", "primary_key": ["id"], + "fields": [{"name": "status", "expression": _ansi("status"), + "dimension": {"is_time": False}}], + }]}), + id="field-string", + ), + pytest.param( + "ws", + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [{"column": "created_at", "name": "created_at", "datatype": "timestamp"}]}], + _hd_root({"name": "ws", "datasets": [{ + "name": "orders", "source": "db.s.orders", "primary_key": ["id"], + "fields": [{"name": "created_at", "expression": _ansi("created_at"), + "dimension": {"is_time": True}}], + }]}), + id="field-timestamp", + ), + # ── labels → label + ai_context + custom_extension ─────────────────────── + pytest.param( + "ws", + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [{"column": "status", "name": 
"status", "datatype": "string", + "labels": ["sales", "reporting"]}]}], + _hd_root({"name": "ws", "datasets": [{ + "name": "orders", "source": "db.s.orders", "primary_key": ["id"], + "fields": [{ + "name": "status", "expression": _ansi("status"), + "dimension": {"is_time": False}, + "ai_context": {"synonyms": ["sales", "reporting"]}, + "label": "sales", + "custom_extensions": [ + {"vendor_name": "HONEYDEW", "data": '{"labels": ["sales", "reporting"]}'}, + ], + }], + }]}), + id="labels", + ), + # ── many-to-one relationship ────────────────────────────────────────────── + pytest.param( + "ws", + [ + {"name": "orders", "keys": ["order_id"], "key_dataset": "orders", "sql": "db.s.orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "connection": [{"src_field": "customer_id", "target_field": "id"}]}], + "dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", + "sql": "db.s.customers", "dataset_attrs": []}, + ], + _hd_root({ + "name": "ws", + "datasets": [ + {"name": "customers", "source": "db.s.customers", "primary_key": ["id"]}, + {"name": "orders", "source": "db.s.orders", "primary_key": ["order_id"]}, + ], + "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", + "from_columns": ["customer_id"], "to_columns": ["id"]}], + }), + id="many-to-one", + ), + # ── one-to-many (direction flipped) ────────────────────────────────────── + pytest.param( + "ws", + [ + {"name": "customers", "keys": ["id"], "key_dataset": "customers", "sql": "db.s.customers", + "relations": [{"target_entity": "orders", "rel_type": "one-to-many", + "connection": [{"src_field": "id", "target_field": "customer_id"}]}], + "dataset_attrs": []}, + {"name": "orders", "keys": ["order_id"], "key_dataset": "orders", + "sql": "db.s.orders", "dataset_attrs": []}, + ], + _hd_root({ + "name": "ws", + "datasets": [ + {"name": "customers", "source": "db.s.customers", "primary_key": ["id"]}, + {"name": "orders", 
"source": "db.s.orders", "primary_key": ["order_id"]}, + ], + "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", + "from_columns": ["customer_id"], "to_columns": ["id"]}], + }), + id="one-to-many-flipped", + ), + # ── metric ──────────────────────────────────────────────────────────────── + pytest.param( + "ws", + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [], + "metrics": [{"type": "metric", "entity": "orders", "name": "count", + "datatype": "number", "sql": "COUNT(*)"}]}], + _hd_root({"name": "ws", "datasets": [ + {"name": "orders", "source": "db.s.orders", "primary_key": ["id"]}, + ], "metrics": [{ + "name": "count", + "expression": _ansi("COUNT(*)"), + "custom_extensions": [{"vendor_name": "HONEYDEW", "data": '{"entity": "orders"}'}], + }]}), + id="metric", + ), + # ── calculated attribute → OSI field with HONEYDEW extension ───────────── + pytest.param( + "ws", + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "discounted", "datatype": "number", + "sql": "orders.price * (1 - orders.discount)"}]}], + _hd_root({"name": "ws", "datasets": [{ + "name": "orders", "source": "db.s.orders", "primary_key": ["id"], + "fields": [{ + "name": "discounted", + "expression": _ansi("orders.price * (1 - orders.discount)"), + "custom_extensions": [ + {"vendor_name": "HONEYDEW", + "data": '{"type": "calculated_attribute", "entity": "orders"}'}, + ], + }], + }]}), + id="calc-attr", + ), ]) -def test_honeydew_to_osi_field_types(tmp_path, col_name, datatype, expected_dim): - _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], - "key_dataset": "orders", "sql": "db.s.orders", - "dataset_attrs": [{"column": col_name, "name": col_name, "datatype": datatype}]}]) +def test_honeydew_to_osi_output(tmp_path, ws_name, entities, expected_root): 
+ _write_workspace(str(tmp_path), ws_name, entities) result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - fields = {f["name"]: f for f in result["semantic_model"][0]["datasets"][0]["fields"]} - assert fields[col_name].get("dimension") == expected_dim + assert result == expected_root -def test_honeydew_to_osi_labels_become_label_and_ai_context(tmp_path): - _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], - "key_dataset": "orders", "sql": "db.s.orders", - "dataset_attrs": [ - {"column": "status", "name": "status", "datatype": "string", - "labels": ["sales", "reporting"]}, - ]}]) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - f = next(f for f in result["semantic_model"][0]["datasets"][0]["fields"] if f["name"] == "status") - assert f["label"] == "sales" - assert "sales" in (f.get("ai_context") or {}).get("synonyms", []) +def test_honeydew_to_osi_missing_workspace_raises(tmp_path): + with pytest.raises(HoneydewConversionError, match="workspace.yml"): + convert_honeydew_to_osi(str(tmp_path)) -def test_honeydew_to_osi_many_to_one_relation(tmp_path): - _write_workspace(str(tmp_path), "ws", [ - {"name": "orders", "keys": ["order_id"], "key_dataset": "orders", "sql": "db.s.orders", - "relations": [{"target_entity": "customers", "rel_type": "many-to-one", - "connection": [{"src_field": "customer_id", "target_field": "id"}]}], - "dataset_attrs": []}, - {"name": "customers", "keys": ["id"], "key_dataset": "customers", "sql": "db.s.customers", "dataset_attrs": []}, - ]) +def test_honeydew_to_osi_missing_schema_dir_empty_model(tmp_path): + (tmp_path / "workspace.yml").write_text(yaml.dump({"type": "workspace", "name": "ws"})) result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - rels = result["semantic_model"][0]["relationships"] - assert len(rels) == 1 - assert rels[0]["from"] == "orders" and rels[0]["to"] == "customers" + assert result == {"version": OSI_VERSION, "vendors": ["HONEYDEW"], + "semantic_model": 
[{"name": "ws", "datasets": []}]} -def test_honeydew_to_osi_one_to_many_direction_flipped(tmp_path): - _write_workspace(str(tmp_path), "ws", [ - {"name": "customers", "keys": ["id"], "key_dataset": "customers", "sql": "db.s.customers", - "relations": [{"target_entity": "orders", "rel_type": "one-to-many", - "connection": [{"src_field": "id", "target_field": "customer_id"}]}], - "dataset_attrs": []}, - {"name": "orders", "keys": ["order_id"], "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": []}, - ]) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - rel = result["semantic_model"][0]["relationships"][0] - assert rel["from"] == "orders" and rel["to"] == "customers" +def test_honeydew_to_osi_empty_metric_sql_skipped(tmp_path): + _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], + "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], + "metrics": [{"type": "metric", "entity": "orders", "name": "bad", + "datatype": "number", "sql": ""}]}]) + with warnings.catch_warnings(record=True): + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert "metrics" not in result["semantic_model"][0] def test_honeydew_to_osi_duplicate_relations_deduplicated(tmp_path): @@ -646,179 +799,129 @@ def test_honeydew_to_osi_duplicate_relations_deduplicated(tmp_path): assert len(result["semantic_model"][0].get("relationships", [])) == 1 -def test_honeydew_to_osi_metric_converted(tmp_path): - _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], - "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], - "metrics": [{"type": "metric", "entity": "orders", "name": "count", - "datatype": "number", "sql": "COUNT(*)"}]}]) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - m = result["semantic_model"][0]["metrics"][0] - assert m["name"] == "count" - assert m["expression"]["dialects"][0]["expression"] == "COUNT(*)" - - -def 
test_honeydew_to_osi_metric_entity_preserved_in_extension(tmp_path): - _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], - "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], - "metrics": [{"type": "metric", "entity": "orders", "name": "cnt", - "datatype": "number", "sql": "COUNT(*)"}]}]) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - m = result["semantic_model"][0]["metrics"][0] - ext = m["custom_extensions"][0] - assert ext["vendor_name"] == "HONEYDEW" - assert json.loads(ext["data"])["entity"] == "orders" - - -def test_honeydew_to_osi_calculated_attribute_as_field(tmp_path): - _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], - "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], - "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", - "name": "discounted", "datatype": "number", - "sql": "orders.price * (1 - orders.discount)"}]}]) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - fields = {f["name"]: f for f in result["semantic_model"][0]["datasets"][0]["fields"]} - assert "discounted" in fields - assert "orders.price" in fields["discounted"]["expression"]["dialects"][0]["expression"] - - -def test_honeydew_to_osi_missing_workspace_raises(tmp_path): - with pytest.raises(HoneydewConversionError, match="workspace.yml"): - convert_honeydew_to_osi(str(tmp_path)) - - -def test_honeydew_to_osi_missing_schema_dir_empty_model(tmp_path): - (tmp_path / "workspace.yml").write_text(yaml.dump({"type": "workspace", "name": "ws"})) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - assert result["semantic_model"][0]["datasets"] == [] - - -def test_honeydew_to_osi_vendors_includes_honeydew(tmp_path): - (tmp_path / "workspace.yml").write_text(yaml.dump({"type": "workspace", "name": "ws"})) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - assert "HONEYDEW" in result.get("vendors", []) - - -def 
test_honeydew_to_osi_empty_metric_sql_skipped(tmp_path): - _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], - "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], - "metrics": [{"type": "metric", "entity": "orders", "name": "bad", - "datatype": "number", "sql": ""}]}]) - with warnings.catch_warnings(record=True): - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - assert "metrics" not in result["semantic_model"][0] - - # ───────────────────────────────────────────────────────────────────────────── -# OSI → Honeydew → OSI round-trip tests +# OSI → Honeydew → OSI round-trip: full semantic model # ───────────────────────────────────────────────────────────────────────────── -def test_osi_roundtrip_name_and_description(tmp_path): - model = {"name": "retail", "description": "Retail model", "datasets": []} - sm = _osi_roundtrip(model, tmp_path) - assert sm["name"] == "retail" and sm["description"] == "Retail model" - - -@pytest.mark.parametrize("primary_key", [ - ["order_id"], - ["order_id", "line_no"], +@pytest.mark.parametrize("model,expected_sm", [ + pytest.param( + {"name": "retail", "description": "Retail model", "datasets": []}, + {"name": "retail", "datasets": [], "description": "Retail model"}, + id="name-desc", + ), + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "primary_key": ["order_id"], "fields": []}]}, + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "primary_key": ["order_id"]}]}, + id="pk-single", + ), + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "primary_key": ["order_id", "line_no"], "fields": []}]}, + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "primary_key": ["order_id", "line_no"]}]}, + id="pk-composite", + ), + pytest.param( + {"name": "m", "datasets": [{"name": "items", "source": "db.s.items", + "primary_key": ["id"], + "unique_keys": [["sku"], ["id", 
"variant"]], + "fields": []}]}, + {"name": "m", "datasets": [{"name": "items", "source": "db.s.items", + "primary_key": ["id"], + "unique_keys": [["sku"], ["id", "variant"]]}]}, + id="unique-keys", + ), + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "status", "label": "sales", + "expression": _ansi("status"), + "dimension": {"is_time": False}}]}]}, + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "status", "expression": _ansi("status"), + "dimension": {"is_time": False}, + "ai_context": {"synonyms": ["sales"]}, + "label": "sales"}]}]}, + id="field-label", + ), + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "total", + "expression": _ansi("total"), + "ai_context": {"instructions": "Use for revenue analysis", + "synonyms": ["revenue", "sales"]}}]}]}, + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "total", "expression": _ansi("total"), + "description": "Use for revenue analysis", + "ai_context": {"instructions": "Use for revenue analysis", + "synonyms": ["revenue", "sales"]}, + "label": "revenue", + "custom_extensions": [ + {"vendor_name": "HONEYDEW", + "data": '{"labels": ["revenue", "sales"]}'}, + ]}]}]}, + id="ai-context-dict", + ), + pytest.param( + {"name": "m", "ai_context": {"instructions": "Retail analytics", "synonyms": ["store"]}, + "datasets": []}, + {"name": "m", "datasets": [], + "ai_context": {"instructions": "Retail analytics", "synonyms": ["store"]}}, + id="model-ai-context", + ), + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "custom_extensions": [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}], + "fields": []}]}, + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "custom_extensions": [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}]}]}, + 
id="custom-ext", + ), + pytest.param( + {"name": "m", "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}]}, + {"name": "m", "datasets": [ + {"name": "customers", "source": "db.s.customers"}, + {"name": "orders", "source": "db.s.orders"}, + ], "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}]}, + id="relationship", + ), + pytest.param( + {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [{"name": "total_revenue", "description": "Sum of sales", + "expression": _ansi("SUM(orders.total)")}]}, + {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders"}], + "metrics": [{"name": "total_revenue", + "expression": _ansi("SUM(orders.total)"), + "custom_extensions": [ + {"vendor_name": "HONEYDEW", "data": '{"entity": "orders"}'}, + ], + "description": "Sum of sales"}]}, + id="metric", + ), + # ai_context string is merged into description and not stored in metadata for fields + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "status", + "expression": _ansi("status"), + "ai_context": "order status, order state", + "dimension": {"is_time": False}}]}]}, + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "status", "expression": _ansi("status"), + "dimension": {"is_time": False}, + "description": "order status, order state"}]}]}, + id="ai-context-string-becomes-desc", + ), ]) -def test_osi_roundtrip_primary_key(tmp_path, primary_key): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", - "primary_key": primary_key, "fields": []}]} - sm = _osi_roundtrip(model, tmp_path) - 
assert sm["datasets"][0]["primary_key"] == primary_key - - -def test_osi_roundtrip_unique_keys(tmp_path): - model = {"name": "m", "datasets": [{"name": "items", "source": "db.s.items", - "primary_key": ["id"], - "unique_keys": [["sku"], ["id", "variant"]], - "fields": []}]} - sm = _osi_roundtrip(model, tmp_path) - assert sm["datasets"][0]["unique_keys"] == [["sku"], ["id", "variant"]] - - -def test_osi_roundtrip_field_label(tmp_path): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", - "fields": [{"name": "status", "label": "sales", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, - "dimension": {"is_time": False}}]}]} - sm = _osi_roundtrip(model, tmp_path) - f = next(f for f in sm["datasets"][0]["fields"] if f["name"] == "status") - assert f["label"] == "sales" - - -def test_osi_roundtrip_ai_context_string(tmp_path): - ai_ctx_value = "order status, order state" - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", - "fields": [{"name": "status", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, - "ai_context": ai_ctx_value, - "dimension": {"is_time": False}}]}]} - sm = _osi_roundtrip(model, tmp_path) - f = next(f for f in sm["datasets"][0]["fields"] if f["name"] == "status") - # String ai_context is merged into description on OSI→Honeydew; value must be recoverable - assert ai_ctx_value in (f.get("description") or "") or f.get("ai_context") == ai_ctx_value - - -def test_osi_roundtrip_ai_context_dict(tmp_path): - ctx = {"instructions": "Use for revenue analysis", "synonyms": ["revenue", "sales"]} - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", - "fields": [{"name": "total", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, - "ai_context": ctx}]}]} - sm = _osi_roundtrip(model, tmp_path) - f = next(f for f in sm["datasets"][0]["fields"] if f["name"] == "total") - assert 
f.get("ai_context") == ctx - - -def test_osi_roundtrip_model_ai_context(tmp_path): - ctx = {"instructions": "Retail analytics", "synonyms": ["store"]} - model = {"name": "m", "ai_context": ctx, "datasets": []} - sm = _osi_roundtrip(model, tmp_path) - assert sm.get("ai_context") == ctx - - -def test_osi_roundtrip_non_honeydew_custom_extensions(tmp_path): - model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", - "custom_extensions": [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}], - "fields": []}]} - sm = _osi_roundtrip(model, tmp_path) - exts = sm["datasets"][0].get("custom_extensions") or [] - assert any(e["vendor_name"] == "SNOWFLAKE" for e in exts) - - -def test_osi_roundtrip_relationship_name(tmp_path): - model = {"name": "m", "datasets": [ - {"name": "orders", "source": "db.s.orders", "fields": []}, - {"name": "customers", "source": "db.s.customers", "fields": []}, - ], "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", - "from_columns": ["cid"], "to_columns": ["id"]}]} - sm = _osi_roundtrip(model, tmp_path) - assert sm["relationships"][0]["name"] == "orders_to_customers" - - -def test_osi_roundtrip_relationship_columns(tmp_path): - model = {"name": "m", "datasets": [ - {"name": "orders", "source": "db.s.orders", "fields": []}, - {"name": "customers", "source": "db.s.customers", "fields": []}, - ], "relationships": [{"name": "r", "from": "orders", "to": "customers", - "from_columns": ["cid"], "to_columns": ["id"]}]} - sm = _osi_roundtrip(model, tmp_path) - rel = sm["relationships"][0] - assert rel["from_columns"] == ["cid"] and rel["to_columns"] == ["id"] - - -def test_osi_roundtrip_metric(tmp_path): - model = {"name": "m", - "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], - "metrics": [{"name": "total_revenue", "description": "Sum of sales", - "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}]} - sm = _osi_roundtrip(model, 
tmp_path) - m = sm["metrics"][0] - assert m["name"] == "total_revenue" - assert m["expression"]["dialects"][0]["expression"] == "SUM(orders.total)" - assert m["description"] == "Sum of sales" +def test_osi_roundtrip_sm(tmp_path, model, expected_sm): + assert _osi_roundtrip(model, tmp_path) == expected_sm def test_osi_roundtrip_tpcds_example(tmp_path): @@ -842,172 +945,253 @@ def test_osi_roundtrip_tpcds_example(tmp_path): # ───────────────────────────────────────────────────────────────────────────── -# Honeydew → OSI → Honeydew round-trip tests +# Honeydew → OSI → Honeydew round-trip: full file content # ───────────────────────────────────────────────────────────────────────────── -def test_honeydew_roundtrip_entity_name_and_keys(tmp_path): - out_dir = _honeydew_roundtrip([{ - "name": "orders", "keys": ["order_id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", "dataset_attrs": [], - }], tmp_path) - entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) - assert entity["name"] == "orders" and entity["keys"] == ["order_id"] - - -def test_honeydew_roundtrip_source(tmp_path): - out_dir = _honeydew_roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.SCHEMA.ORDERS", "dataset_attrs": [], - }], tmp_path) - ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) - assert ds["sql"] == "DB.SCHEMA.ORDERS" - - -def test_honeydew_roundtrip_column_attributes(tmp_path): - out_dir = _honeydew_roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", - "dataset_attrs": [ - {"column": "o_id", "name": "id", "datatype": "number"}, - {"column": "o_status", "name": "status", "datatype": "string"}, +@pytest.mark.parametrize("entities,path,expected", [ + # ── entity name + keys ──────────────────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["order_id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": []}], + 
"schema/orders/orders.yml", + {"type": "entity", "name": "orders", "keys": ["order_id"], + "key_dataset": "orders", "relations": []}, + id="entity-keys", + ), + # ── dataset source ──────────────────────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.SCHEMA.ORDERS", "dataset_attrs": []}], + "schema/orders/datasets/orders.yml", + {"type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.SCHEMA.ORDERS", "dataset_type": "table", "attributes": []}, + id="dataset-source", + ), + # ── column attributes ───────────────────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [ + {"column": "o_id", "name": "id", "datatype": "number"}, + {"column": "o_status", "name": "status", "datatype": "string"}, + ]}], + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", + "attributes": [ + {"column": "o_id", "name": "id", "datatype": "number"}, + {"column": "o_status", "name": "status", "datatype": "string"}, + ], + }, + id="column-attrs", + ), + # ── labels on column ────────────────────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [{"column": "status", "name": "status", "datatype": "string", + "labels": ["sales"]}]}], + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", + "attributes": [{ + "column": "status", "name": "status", "datatype": "string", + "labels": ["sales"], + "metadata": [{"name": "osi", "metadata": [ + {"name": "ai_context", "value": '{"synonyms": ["sales"]}'}, + ]}], + }], + }, + id="labels-on-column", + ), + # ── calculated attribute sql 
─────────────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "disc", "datatype": "number", + "sql": "orders.price * (1 - orders.discount)"}]}], + "schema/orders/attributes/disc.yml", + {"type": "calculated_attribute", "entity": "orders", "name": "disc", + "datatype": "number", "sql": "orders.price * (1 - orders.discount)"}, + id="calc-attr-sql", + ), + # ── calc attr with simple identifier stays as calc_attr ────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "revenue", "datatype": "number", "sql": "revenue"}]}], + "schema/orders/attributes/revenue.yml", + {"type": "calculated_attribute", "entity": "orders", "name": "revenue", + "datatype": "number", "sql": "revenue"}, + id="calc-simple-stays-calc", + ), + # ── metric entity assignment ────────────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], + "metrics": [{"type": "metric", "entity": "orders", "name": "cnt", + "datatype": "number", "sql": "COUNT(*)"}]}], + "schema/orders/metrics/cnt.yml", + {"type": "metric", "entity": "orders", "name": "cnt", + "datatype": "number", "sql": "COUNT(*)"}, + id="metric", + ), + # ── many-to-one relation ────────────────────────────────────────────────── + pytest.param( + [ + {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "connection": [{"src_field": "cid", "target_field": "id"}]}], + "dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", + "sql": "DB.S.CUSTOMERS", 
"dataset_attrs": []}, ], - }], tmp_path) - ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) - attrs = {a["name"]: a for a in ds["attributes"]} - assert attrs["id"]["column"] == "o_id" - assert attrs["status"]["datatype"] == "string" - - -def test_honeydew_roundtrip_labels_on_column(tmp_path): - out_dir = _honeydew_roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", - "dataset_attrs": [{"column": "status", "name": "status", "datatype": "string", "labels": ["sales"]}], - }], tmp_path) - ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) - attrs = {a["name"]: a for a in ds["attributes"]} - assert "sales" in attrs["status"].get("labels", []) - - -def test_honeydew_roundtrip_calculated_attribute_sql(tmp_path): - out_dir = _honeydew_roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", "dataset_attrs": [], - "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", - "name": "disc", "datatype": "number", - "sql": "orders.price * (1 - orders.discount)"}], - }], tmp_path) - calc = yaml.safe_load((out_dir / "schema/orders/attributes/disc.yml").read_text()) - assert calc["sql"] == "orders.price * (1 - orders.discount)" - - -def test_honeydew_roundtrip_metric_entity_assignment(tmp_path): - out_dir = _honeydew_roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", "dataset_attrs": [], - "metrics": [{"type": "metric", "entity": "orders", "name": "cnt", - "datatype": "number", "sql": "COUNT(*)"}], - }], tmp_path) - m = yaml.safe_load((out_dir / "schema/orders/metrics/cnt.yml").read_text()) - assert m["entity"] == "orders" and m["sql"] == "COUNT(*)" - - -def test_honeydew_roundtrip_relation(tmp_path): - out_dir = _honeydew_roundtrip([ - {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", - "relations": [{"target_entity": "customers", "rel_type": 
"many-to-one", - "connection": [{"src_field": "cid", "target_field": "id"}]}], - "dataset_attrs": []}, - {"name": "customers", "keys": ["id"], "key_dataset": "customers", - "sql": "DB.S.CUSTOMERS", "dataset_attrs": []}, - ], tmp_path) - entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) - assert entity["relations"][0]["target_entity"] == "customers" - assert entity["relations"][0]["connection"][0]["src_field"] == "cid" - - -def test_honeydew_roundtrip_bool_datatype(tmp_path): - out_dir = _honeydew_roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", - "dataset_attrs": [{"column": "is_active", "name": "is_active", "datatype": "bool"}], - }], tmp_path) - ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) - attrs = {a["name"]: a for a in ds["attributes"]} - assert attrs["is_active"]["datatype"] == "bool" - - -def test_honeydew_roundtrip_connection_expr(tmp_path): - out_dir = _honeydew_roundtrip([ - {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", - "relations": [{"target_entity": "customers", "rel_type": "many-to-one", - "connection_expr": {"sql": "orders.cid = customers.id AND orders.region = customers.region"}}], - "dataset_attrs": []}, - {"name": "customers", "keys": ["id"], "key_dataset": "customers", - "sql": "DB.S.CUSTOMERS", "dataset_attrs": []}, - ], tmp_path) - entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) - rel = entity["relations"][0] - assert rel.get("connection_expr", {}).get("sql") == "orders.cid = customers.id AND orders.region = customers.region" - - -@pytest.mark.parametrize("attr_extra,check_key,check_val", [ - ({"display_name": "Order Status"}, "display_name", "Order Status"), - ({"hidden": True}, "hidden", True), - ({"format_string": "##,###"}, "format_string", "##,###"), -]) -def test_honeydew_roundtrip_dataset_attr_honeydew_field(tmp_path, attr_extra, check_key, check_val): - attr = 
{"column": "status", "name": "status", "datatype": "string", **attr_extra} - out_dir = _honeydew_roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", "dataset_attrs": [attr], - }], tmp_path) - ds = yaml.safe_load((out_dir / "schema/orders/datasets/orders.yml").read_text()) - attrs = {a["name"]: a for a in ds["attributes"]} - assert attrs["status"][check_key] == check_val - - -@pytest.mark.parametrize("calc_extra,check_key,check_val", [ - ({"display_name": "Discounted Price"}, "display_name", "Discounted Price"), - ({"timegrain": "day"}, "timegrain", "day"), -]) -def test_honeydew_roundtrip_calc_attr_honeydew_field(tmp_path, calc_extra, check_key, check_val): - calc = {"type": "calculated_attribute", "entity": "orders", - "name": "disc", "datatype": "number", "sql": "orders.price * 0.9", **calc_extra} - out_dir = _honeydew_roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", "dataset_attrs": [], "calc_attrs": [calc], - }], tmp_path) - result = yaml.safe_load((out_dir / "schema/orders/attributes/disc.yml").read_text()) - assert result[check_key] == check_val - - -@pytest.mark.parametrize("entity_extra,check_key,check_val", [ - ({"owner": "analytics_team"}, "owner", "analytics_team"), - ({"display_name": "Orders Table"}, "display_name", "Orders Table"), - ({"hidden": True}, "hidden", True), - ({"folder": "finance"}, "folder", "finance"), + "schema/orders/orders.yml", + { + "type": "entity", "name": "orders", "keys": ["id"], + "key_dataset": "orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "name": "orders_to_customers", + "connection": [{"src_field": "cid", "target_field": "id"}]}], + }, + id="relation", + ), + # ── connection_expr round-trip ──────────────────────────────────────────── + pytest.param( + [ + {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "relations": [{"target_entity": "customers", 
"rel_type": "many-to-one", + "connection_expr": {"sql": "orders.cid = customers.id AND orders.region = customers.region"}}], + "dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", + "sql": "DB.S.CUSTOMERS", "dataset_attrs": []}, + ], + "schema/orders/orders.yml", + { + "type": "entity", "name": "orders", "keys": ["id"], + "key_dataset": "orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "name": "orders_to_customers", + "connection_expr": {"sql": "orders.cid = customers.id AND orders.region = customers.region"}}], + }, + id="connection-expr", + ), + # ── bool datatype ───────────────────────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [{"column": "is_active", "name": "is_active", "datatype": "bool"}]}], + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", + "attributes": [{"column": "is_active", "name": "is_active", "datatype": "bool"}], + }, + id="bool-datatype", + ), + # ── Honeydew-specific attribute fields ─────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [{"column": "status", "name": "status", "datatype": "string", + "display_name": "Order Status"}]}], + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", + "attributes": [{"column": "status", "name": "status", "datatype": "string", + "display_name": "Order Status"}], + }, + id="attr-display-name", + ), + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [{"column": "status", "name": "status", "datatype": "string", + "hidden": True}]}], + 
"schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", + "attributes": [{"column": "status", "name": "status", "datatype": "string", + "hidden": True}], + }, + id="attr-hidden", + ), + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [{"column": "status", "name": "status", "datatype": "string", + "format_string": "##,###"}]}], + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", + "attributes": [{"column": "status", "name": "status", "datatype": "string", + "format_string": "##,###"}], + }, + id="attr-format-string", + ), + # ── Honeydew-specific calc attr fields ──────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "disc", "datatype": "number", + "sql": "orders.price * 0.9", "display_name": "Discounted Price"}]}], + "schema/orders/attributes/disc.yml", + {"type": "calculated_attribute", "entity": "orders", "name": "disc", + "datatype": "number", "sql": "orders.price * 0.9", "display_name": "Discounted Price"}, + id="calc-display-name", + ), + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "disc", "datatype": "number", + "sql": "orders.price * 0.9", "timegrain": "day"}]}], + "schema/orders/attributes/disc.yml", + {"type": "calculated_attribute", "entity": "orders", "name": "disc", + "datatype": "number", "sql": "orders.price * 0.9", "timegrain": "day"}, + id="calc-timegrain", + ), + # ── Honeydew-specific entity fields ────────────────────────────────────── + pytest.param( 
+ [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], "owner": "analytics_team"}], + "schema/orders/orders.yml", + {"type": "entity", "name": "orders", "keys": ["id"], "key_dataset": "orders", + "owner": "analytics_team", "relations": []}, + id="entity-owner", + ), + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], "display_name": "Orders Table"}], + "schema/orders/orders.yml", + {"type": "entity", "name": "orders", "keys": ["id"], "key_dataset": "orders", + "display_name": "Orders Table", "relations": []}, + id="entity-display-name", + ), + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], "hidden": True}], + "schema/orders/orders.yml", + {"type": "entity", "name": "orders", "keys": ["id"], "key_dataset": "orders", + "hidden": True, "relations": []}, + id="entity-hidden", + ), + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], "folder": "finance"}], + "schema/orders/orders.yml", + {"type": "entity", "name": "orders", "keys": ["id"], "key_dataset": "orders", + "folder": "finance", "relations": []}, + id="entity-folder", + ), ]) -def test_honeydew_roundtrip_entity_honeydew_field(tmp_path, entity_extra, check_key, check_val): - out_dir = _honeydew_roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", "dataset_attrs": [], **entity_extra, - }], tmp_path) - entity = yaml.safe_load((out_dir / "schema/orders/orders.yml").read_text()) - assert entity.get(check_key) == check_val - - -def test_honeydew_roundtrip_calc_attr_simple_identifier_stays_calc(tmp_path): - out_dir = _honeydew_roundtrip([{ - "name": "orders", "keys": ["id"], "key_dataset": "orders", - "sql": "DB.S.ORDERS", "dataset_attrs": [], - "calc_attrs": [{"type": "calculated_attribute", "entity": 
"orders", - "name": "revenue", "datatype": "number", "sql": "revenue"}], - }], tmp_path) - calc_path = out_dir / "schema/orders/attributes/revenue.yml" - assert calc_path.exists(), "calculated_attribute with simple-id sql should not become a dataset column" - calc = yaml.safe_load(calc_path.read_text()) - assert calc["sql"] == "revenue" +def test_honeydew_roundtrip_file(tmp_path, entities, path, expected): + out_dir = _honeydew_roundtrip(entities, tmp_path) + p = out_dir / path + assert p.exists(), f"Expected file {path!r} was not generated" + assert yaml.safe_load(p.read_text()) == expected # ───────────────────────────────────────────────────────────────────────────── @@ -1026,7 +1210,8 @@ def test_empty_or_whitespace_field_expression_skipped(expression): }]}]} files = convert_osi_to_honeydew(_osi(model)) ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - assert all(a["name"] != "bad" for a in ds["attributes"]) + assert ds == {"type": "dataset", "entity": "orders", "name": "orders", + "sql": "db.s.orders", "dataset_type": "table", "attributes": []} assert "schema/orders/attributes/bad.yml" not in files @@ -1053,7 +1238,8 @@ def test_non_dict_expression_warns(): files = convert_osi_to_honeydew(_osi(model)) assert any("must be a mapping" in str(x.message) for x in w) ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) - assert all(a["name"] != "bad" for a in ds["attributes"]) + assert ds == {"type": "dataset", "entity": "orders", "name": "orders", + "sql": "db.s.orders", "dataset_type": "table", "attributes": []} def test_duplicate_metric_name_warns(): @@ -1067,8 +1253,10 @@ def test_duplicate_metric_name_warns(): warnings.simplefilter("always") files = convert_osi_to_honeydew(_osi(model)) assert any("total" in str(x.message) for x in w) - m = yaml.safe_load(files["schema/orders/metrics/total.yml"]) - assert "orders.b" in m["sql"] + assert yaml.safe_load(files["schema/orders/metrics/total.yml"]) == { + "type": "metric", "entity": 
"orders", "name": "total", + "datatype": "number", "sql": "SUM(orders.b)", + } def test_metric_string_ai_context_preserved_in_roundtrip(tmp_path): @@ -1076,14 +1264,18 @@ def test_metric_string_ai_context_preserved_in_roundtrip(tmp_path): "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], "metrics": [{"name": "rev", "ai_context": "Use for revenue analysis", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}]} - files = convert_osi_to_honeydew(_osi(model)) - for rel_path, content in files.items(): - p = tmp_path / rel_path - p.parent.mkdir(parents=True, exist_ok=True) - p.write_text(content) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - m = result["semantic_model"][0]["metrics"][0] - assert m.get("ai_context") == "Use for revenue analysis" + sm = _osi_roundtrip(model, tmp_path) + assert sm == { + "name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders"}], + "metrics": [{ + "name": "rev", + "expression": _ansi("SUM(orders.total)"), + "custom_extensions": [{"vendor_name": "HONEYDEW", "data": '{"entity": "orders"}'}], + "description": "Use for revenue analysis", + "ai_context": "Use for revenue analysis", + }], + } def test_malformed_osi_metadata_json_warns(tmp_path): @@ -1117,15 +1309,16 @@ def test_fields_to_honeydew_simple_identifier_goes_to_dataset(): fields = [{"name": "status", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, "dimension": {"is_time": False}}] dataset_attrs, calc_attrs = _fields_to_honeydew(fields, "orders") - assert len(dataset_attrs) == 1 and len(calc_attrs) == 0 - assert dataset_attrs[0]["column"] == "status" + assert dataset_attrs == [{"column": "status", "name": "status", "datatype": "string"}] + assert calc_attrs == [] def test_fields_to_honeydew_complex_sql_goes_to_calc(): fields = [{"name": "disc", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "price * 0.9"}]}}] dataset_attrs, calc_attrs = 
_fields_to_honeydew(fields, "orders") - assert len(dataset_attrs) == 0 and len(calc_attrs) == 1 - assert calc_attrs[0]["sql"] == "price * 0.9" + assert dataset_attrs == [] + assert calc_attrs == [{"type": "calculated_attribute", "entity": "orders", "name": "disc", + "datatype": "number", "sql": "price * 0.9"}] def test_fields_to_honeydew_missing_name_raises(): @@ -1153,34 +1346,25 @@ def test_connectionless_relation_warns(): warnings.simplefilter("always") files = convert_osi_to_honeydew(_osi(model)) assert any("resolve the join" in str(x.message) for x in w) - entity = yaml.safe_load(files["schema/orders/orders.yml"]) - assert entity["relations"][0]["target_entity"] == "customers" + assert yaml.safe_load(files["schema/orders/orders.yml"]) == { + "type": "entity", "name": "orders", "key_dataset": "orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", "name": "r"}], + } # ───────────────────────────────────────────────────────────────────────────── -# vendors round-trip +# Vendors round-trip # ───────────────────────────────────────────────────────────────────────────── -def test_vendors_roundtrip_preserves_non_honeydew(tmp_path): - doc = yaml.dump({ - "version": OSI_VERSION, - "vendors": ["SNOWFLAKE", "HONEYDEW"], - "semantic_model": [{"name": "m", "datasets": []}], - }) - files = convert_osi_to_honeydew(doc) - for rel_path, content in files.items(): - p = tmp_path / rel_path - p.parent.mkdir(parents=True, exist_ok=True) - p.write_text(content) - result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - assert "SNOWFLAKE" in result["vendors"] - assert "HONEYDEW" in result["vendors"] - - -def test_vendors_always_includes_honeydew(tmp_path): +@pytest.mark.parametrize("input_vendors,expected_vendors", [ + (["SNOWFLAKE", "HONEYDEW"], ["HONEYDEW", "SNOWFLAKE"]), + (["SNOWFLAKE"], ["HONEYDEW", "SNOWFLAKE"]), + (["HONEYDEW"], ["HONEYDEW"]), +]) +def test_vendors_roundtrip(tmp_path, input_vendors, expected_vendors): doc = yaml.dump({ 
"version": OSI_VERSION, - "vendors": ["SNOWFLAKE"], + "vendors": input_vendors, "semantic_model": [{"name": "m", "datasets": []}], }) files = convert_osi_to_honeydew(doc) @@ -1189,7 +1373,8 @@ def test_vendors_always_includes_honeydew(tmp_path): p.parent.mkdir(parents=True, exist_ok=True) p.write_text(content) result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) - assert result["vendors"][0] == "HONEYDEW" + assert result == {"version": OSI_VERSION, "vendors": expected_vendors, + "semantic_model": [{"name": "m", "datasets": []}]} # ───────────────────────────────────────────────────────────────────────────── @@ -1197,7 +1382,7 @@ def test_vendors_always_includes_honeydew(tmp_path): # ───────────────────────────────────────────────────────────────────────────── def test_main_osi_to_honeydew(tmp_path): - import subprocess, sys + import subprocess input_file = tmp_path / "model.yaml" input_file.write_text(yaml.dump({ "version": OSI_VERSION, @@ -1212,13 +1397,13 @@ def test_main_osi_to_honeydew(tmp_path): capture_output=True, text=True, ) assert result.returncode == 0 - assert (output_dir / "workspace.yml").exists() - ws = yaml.safe_load((output_dir / "workspace.yml").read_text()) - assert ws["name"] == "m" + assert yaml.safe_load((output_dir / "workspace.yml").read_text()) == { + "type": "workspace", "name": "m", + } def test_main_honeydew_to_osi(tmp_path): - import subprocess, sys + import subprocess _write_workspace(str(tmp_path), "ws", [{ "name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", "dataset_attrs": [], @@ -1230,14 +1415,17 @@ def test_main_honeydew_to_osi(tmp_path): capture_output=True, text=True, ) assert result.returncode == 0 - assert output_file.exists() - doc = yaml.safe_load(output_file.read_text()) - assert doc["semantic_model"][0]["name"] == "ws" + assert yaml.safe_load(output_file.read_text()) == { + "version": OSI_VERSION, + "vendors": ["HONEYDEW"], + "semantic_model": [{"name": "ws", "datasets": [ + {"name": 
"orders", "source": "DB.S.ORDERS", "primary_key": ["id"]}, + ]}], + } def test_main_path_traversal_rejected(tmp_path): - import subprocess, sys - # Entity name containing traversal sequences generates paths that escape output_dir + import subprocess input_file = tmp_path / "model.yaml" input_file.write_text( f"version: '{OSI_VERSION}'\nsemantic_model:\n" From 8627fb4d278fb74b406f5ff0c973c814138d9a73 Mon Sep 17 00:00:00 2001 From: Baruch Oxman Date: Wed, 27 May 2026 21:32:28 +0300 Subject: [PATCH 08/13] Address PR review: fix docstring and README inaccuracies - Docstring: relationship name is mapped directly to Honeydew's relation name field (not osi metadata); ai_context is mapped natively to description/labels/AI metadata, not treated as having no equivalent - README mapping table: rename rows to use Honeydew's canonical terms (Source Attribute, Calculated Attribute) and add missing ai_context row - README limitations: rewrite the confusing "One dataset per entity" bullet to clarify that OSI dataset = one table/query, Honeydew supports multiple dataset files per entity but the converter generates exactly one Co-Authored-By: Claude Sonnet 4.6 --- converters/honeydew/README.md | 7 ++++--- converters/honeydew/src/honeydew_osi_converter.py | 10 ++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/converters/honeydew/README.md b/converters/honeydew/README.md index 62e1e8b..ad3158a 100644 --- a/converters/honeydew/README.md +++ b/converters/honeydew/README.md @@ -17,8 +17,9 @@ Bidirectional converter between [OSI](../../core-spec/spec.md) semantic models a | `dataset` | Entity + dataset files under `schema//` | | `dataset.source` | `dataset.sql` | | `dataset.primary_key` | `entity.keys` | -| Simple column field | `dataset.attributes` entry | -| Computed field expression | `calculated_attribute` YAML | +| Simple column field | Source Attribute (`dataset.attributes` entry) | +| Computed field expression | Calculated Attribute (`calculated_attribute` 
YAML) | +| `field.ai_context` | AI Metadata on the attribute or entity | | `relationship` (from → to) | `entity.relations` on the "from" entity (`rel_type: many-to-one`) | | `metric` | `metric` YAML (assigned to entity by expression parse) | @@ -66,7 +67,7 @@ python -m pytest tests/ ## Limitations -- **One dataset per entity**: The converter maps each OSI dataset to a single Honeydew entity with one source dataset. Multiple datasets per entity are not generated. +- **One source dataset per entity**: Honeydew entities can have multiple source dataset files; the converter always generates exactly one, because an OSI `dataset` block describes a single table or SQL query. - **Datatype inference**: OSI fields have no explicit datatype; the converter infers Honeydew datatypes from the `dimension.is_time` flag (`timestamp`) and the presence/absence of the `dimension` key (`string` vs `number`). - **Honeydew SQL expressions**: Calculated attributes and metrics use Honeydew's `entity.attribute` reference syntax. These are exported as `ANSI_SQL` dialect expressions in OSI; they remain valid for round-tripping but may not run on other databases without adaptation. - **Perspectives and domains**: Not converted (no OSI equivalent). diff --git a/converters/honeydew/src/honeydew_osi_converter.py b/converters/honeydew/src/honeydew_osi_converter.py index 3857bb7..b12a539 100644 --- a/converters/honeydew/src/honeydew_osi_converter.py +++ b/converters/honeydew/src/honeydew_osi_converter.py @@ -51,10 +51,12 @@ def convert_osi_to_honeydew(osi_yaml_str: str) -> dict[str, str]: attributes/.yml (computed fields only) metrics/.yml - OSI fields with no direct Honeydew equivalent (``ai_context``, - ``unique_keys``, non-Honeydew ``custom_extensions``, relationship - ``name``) are stored in the Honeydew ``metadata`` section under a section - named ``"osi"`` so they can be recovered on the return trip. 
+ ``ai_context`` is mapped to Honeydew's native fields (``description``, + ``labels``, and the AI metadata section); the structured form is also + stored in ``metadata`` for lossless round-tripping. ``unique_keys`` and + non-Honeydew ``custom_extensions`` have no direct Honeydew equivalent and + are stored in the Honeydew ``metadata`` section under a section named + ``"osi"`` so they can be recovered on the return trip. Args: osi_yaml_str: OSI YAML document as a string. From ba2af82ad337dc8e8b9205e1e0c9687c34a5ef57 Mon Sep 17 00:00:00 2001 From: Baruch Oxman Date: Tue, 2 Jun 2026 09:52:29 -0700 Subject: [PATCH 09/13] Fix label idempotency bug and add multi-dataset warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Store OSI `label` in osi metadata when field has label but no dict ai_context, so the Honeydew→OSI path can distinguish OSI-originated labels from native Honeydew labels and avoid injecting spurious ai_context.synonyms on round-trip - Add `label` parameter to `_build_osi_metadata` and corresponding read support in `_read_osi_metadata` - Warn when an entity has more than one dataset file during Honeydew→OSI conversion (only the primary dataset is converted) - Update test expectations to reflect correct idempotent behavior Co-Authored-By: Claude Sonnet 4.6 --- .../honeydew/src/honeydew_osi_converter.py | 30 +++++++++++++++---- .../tests/test_honeydew_osi_converter.py | 6 ++-- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/converters/honeydew/src/honeydew_osi_converter.py b/converters/honeydew/src/honeydew_osi_converter.py index b12a539..79e3772 100644 --- a/converters/honeydew/src/honeydew_osi_converter.py +++ b/converters/honeydew/src/honeydew_osi_converter.py @@ -188,6 +188,7 @@ def _fields_to_honeydew( field_meta = _build_osi_metadata( ai_context=field_ai_ctx if isinstance(field_ai_ctx, dict) else None, + label=field_label if field_label and not isinstance(field_ai_ctx, dict) else None, 
custom_extensions=field_ext or None, ) @@ -623,6 +624,12 @@ def _read_entity_dir(entity_dir: str, entity_name: str) -> dict[str, Any]: if fn.endswith((".yml", ".yaml")): with open(os.path.join(datasets_dir, fn)) as f: all_ds.append(yaml.safe_load(f) or {}) + if len(all_ds) > 1: + warnings.warn( + f"Entity '{entity_name}' has {len(all_ds)} dataset files; " + "only the primary dataset will be converted", + stacklevel=2, + ) for ds in all_ds: if ds.get("name") == data["key_dataset"] or data["primary_dataset"] is None: data["primary_dataset"] = ds @@ -700,14 +707,18 @@ def _entity_to_osi_dataset(entity_data: dict[str, Any]) -> dict[str, Any]: attr_osi_meta = _read_osi_metadata(attr) attr_labels = attr.get("labels") or [] - # Restore ai_context (structured form takes priority, else build from labels) + # Restore ai_context (structured form takes priority) + # Only synthesise synonyms from labels when they are native Honeydew labels + # (i.e. osi_meta has no 'label' key, meaning the label didn't come from OSI) if attr_osi_meta.get("ai_context"): field["ai_context"] = attr_osi_meta["ai_context"] - elif attr_labels: + elif attr_labels and "label" not in attr_osi_meta: field["ai_context"] = {"synonyms": list(attr_labels)} - # Restore label (first Honeydew label maps to OSI label) - if attr_labels: + # Restore label: prefer osi_meta (exact round-trip), else first Honeydew label + if "label" in attr_osi_meta: + field["label"] = attr_osi_meta["label"] + elif attr_labels: field["label"] = attr_labels[0] # Honeydew-specific metadata → HONEYDEW custom_extension @@ -754,10 +765,12 @@ def _entity_to_osi_dataset(entity_data: dict[str, Any]) -> dict[str, Any]: if calc_osi_meta.get("ai_context"): field["ai_context"] = calc_osi_meta["ai_context"] - elif calc_labels: + elif calc_labels and "label" not in calc_osi_meta: field["ai_context"] = {"synonyms": list(calc_labels)} - if calc_labels: + if "label" in calc_osi_meta: + field["label"] = calc_osi_meta["label"] + elif calc_labels: 
field["label"] = calc_labels[0] calc_honeydew_extra = { @@ -882,6 +895,7 @@ def _honeydew_metric_to_osi(metric: dict[str, Any], entity_name: str) -> dict[st def _build_osi_metadata( *, ai_context: Any = None, + label: str | None = None, unique_keys: Any = None, custom_extensions: list | None = None, extra_vendors: list[str] | None = None, @@ -892,6 +906,8 @@ def _build_osi_metadata( if ai_context is not None: val = ai_context if isinstance(ai_context, str) else json.dumps(ai_context) items.append({"name": "ai_context", "value": val}) + if label is not None: + items.append({"name": "label", "value": label}) if unique_keys: items.append({"name": "unique_keys", "value": json.dumps(unique_keys)}) if custom_extensions: @@ -918,6 +934,8 @@ def _read_osi_metadata(obj: dict[str, Any]) -> dict[str, Any]: result[key] = json.loads(raw) except (json.JSONDecodeError, TypeError): result[key] = raw + elif key == "label": + result[key] = raw elif key in ("unique_keys", "custom_extensions", "vendors"): try: result[key] = json.loads(raw) diff --git a/converters/honeydew/tests/test_honeydew_osi_converter.py b/converters/honeydew/tests/test_honeydew_osi_converter.py index 46e1e7a..a3c9dab 100644 --- a/converters/honeydew/tests/test_honeydew_osi_converter.py +++ b/converters/honeydew/tests/test_honeydew_osi_converter.py @@ -401,7 +401,10 @@ def test_check_safe_path(rel_path, expected): "type": "dataset", "entity": "orders", "name": "orders", "sql": "db.s.orders", "dataset_type": "table", "attributes": [{"column": "status", "name": "status", - "datatype": "string", "labels": ["sales"]}], + "datatype": "string", "labels": ["sales"], + "metadata": [{"name": "osi", "metadata": [ + {"name": "label", "value": "sales"} + ]}]}], }, id="label-in-attr", ), @@ -841,7 +844,6 @@ def test_honeydew_to_osi_duplicate_relations_deduplicated(tmp_path): {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{"name": "status", "expression": _ansi("status"), "dimension": 
{"is_time": False}, - "ai_context": {"synonyms": ["sales"]}, "label": "sales"}]}]}, id="field-label", ), From 5b69dd7e2d6ee6f607f678ed229b9ff21dff874a Mon Sep 17 00:00:00 2001 From: Baruch Oxman Date: Tue, 2 Jun 2026 09:56:43 -0700 Subject: [PATCH 10/13] Fix label restoration to not fire when labels came from ai_context synonyms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a field has ai_context.synonyms but no label, the osi metadata section contains ai_context but no label key. Guard the label fallback on the absence of osi_meta ai_context so that synonyms don't get promoted to OSI label on round-trip. Verified with TPC-DS: zero semantic differences over OSI→HD→OSI→HD. Co-Authored-By: Claude Sonnet 4.6 --- converters/honeydew/src/honeydew_osi_converter.py | 5 +++-- converters/honeydew/tests/test_honeydew_osi_converter.py | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/converters/honeydew/src/honeydew_osi_converter.py b/converters/honeydew/src/honeydew_osi_converter.py index 79e3772..a3c92f1 100644 --- a/converters/honeydew/src/honeydew_osi_converter.py +++ b/converters/honeydew/src/honeydew_osi_converter.py @@ -716,9 +716,10 @@ def _entity_to_osi_dataset(entity_data: dict[str, Any]) -> dict[str, Any]: field["ai_context"] = {"synonyms": list(attr_labels)} # Restore label: prefer osi_meta (exact round-trip), else first Honeydew label + # Don't set label when labels came from ai_context.synonyms (osi_meta has ai_context) if "label" in attr_osi_meta: field["label"] = attr_osi_meta["label"] - elif attr_labels: + elif attr_labels and not attr_osi_meta.get("ai_context"): field["label"] = attr_labels[0] # Honeydew-specific metadata → HONEYDEW custom_extension @@ -770,7 +771,7 @@ def _entity_to_osi_dataset(entity_data: dict[str, Any]) -> dict[str, Any]: if "label" in calc_osi_meta: field["label"] = calc_osi_meta["label"] - elif calc_labels: + elif calc_labels and not calc_osi_meta.get("ai_context"): 
field["label"] = calc_labels[0] calc_honeydew_extra = { diff --git a/converters/honeydew/tests/test_honeydew_osi_converter.py b/converters/honeydew/tests/test_honeydew_osi_converter.py index a3c9dab..d9a778e 100644 --- a/converters/honeydew/tests/test_honeydew_osi_converter.py +++ b/converters/honeydew/tests/test_honeydew_osi_converter.py @@ -858,7 +858,6 @@ def test_honeydew_to_osi_duplicate_relations_deduplicated(tmp_path): "description": "Use for revenue analysis", "ai_context": {"instructions": "Use for revenue analysis", "synonyms": ["revenue", "sales"]}, - "label": "revenue", "custom_extensions": [ {"vendor_name": "HONEYDEW", "data": '{"labels": ["revenue", "sales"]}'}, From ae4f48e95db64a10b85d1aca4929962ab986a66c Mon Sep 17 00:00:00 2001 From: Baruch Oxman Date: Tue, 2 Jun 2026 10:01:51 -0700 Subject: [PATCH 11/13] Add HONEYDEW to well-known vendors in spec and converters index Co-Authored-By: Claude Sonnet 4.6 --- converters/index.md | 12 +++++++----- core-spec/spec.md | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/converters/index.md b/converters/index.md index 6477291..5395865 100644 --- a/converters/index.md +++ b/converters/index.md @@ -18,11 +18,12 @@ OSI converters follow a **hub-and-spoke** architecture: │ ┌─────────────┐ ┌─────┴─────┐ ┌─────────────┐ │ dbt ├────┤ OSI ├────┤ Salesforce │ -└─────────────┘ └─────┬─────┘ └─────────────┘ - │ - ┌──────┴──────┐ - │ Databricks │ - └─────────────┘ +└─────────────┘ └──┬─────┬──┘ └─────────────┘ + │ │ + ┌────────┘ └────────┐ + ┌──────┴──────┐ ┌───────┴─────┐ + │ Databricks │ │ Honeydew │ + └─────────────┘ └─────────────┘ ``` This approach avoids the need for point-to-point converters between every pair of vendors. With N vendors, a point-to-point strategy would require N*(N-1) converters. With OSI as the hub, only 2*N converters are needed (one import and one export per vendor), and interoperability with all other vendors comes for free. 
@@ -54,6 +55,7 @@ The OSI specification currently defines extensions for the following vendors: | `SALESFORCE` | Salesforce / Tableau semantic layer | | `DBT` | dbt semantic models | | `DATABRICKS` | Databricks semantic layer | +| `HONEYDEW` | Honeydew workspace | Each vendor may define custom extensions (via the `custom_extensions` field in the OSI spec) to carry vendor-specific metadata that does not have an equivalent in the core specification. diff --git a/core-spec/spec.md b/core-spec/spec.md index 089f56d..ac410e3 100644 --- a/core-spec/spec.md +++ b/core-spec/spec.md @@ -54,6 +54,7 @@ The following are well-known examples: | `DBT` | dbt-specific attributes | | `DATABRICKS` | Databricks-specific attributes | | `GOODDATA` | GoodData-specific attributes | +| `HONEYDEW` | Honeydew-specific attributes | ## Semantic Model From 4303054193069d959068a30cf70c75ea5938d2ab Mon Sep 17 00:00:00 2001 From: Baruch Oxman Date: Tue, 2 Jun 2026 10:11:15 -0700 Subject: [PATCH 12/13] Revert hub-and-spoke diagram change in converters/index.md Co-Authored-By: Claude Sonnet 4.6 --- converters/index.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/converters/index.md b/converters/index.md index 5395865..462d290 100644 --- a/converters/index.md +++ b/converters/index.md @@ -18,12 +18,11 @@ OSI converters follow a **hub-and-spoke** architecture: │ ┌─────────────┐ ┌─────┴─────┐ ┌─────────────┐ │ dbt ├────┤ OSI ├────┤ Salesforce │ -└─────────────┘ └──┬─────┬──┘ └─────────────┘ - │ │ - ┌────────┘ └────────┐ - ┌──────┴──────┐ ┌───────┴─────┐ - │ Databricks │ │ Honeydew │ - └─────────────┘ └─────────────┘ +└─────────────┘ └─────┬─────┘ └─────────────┘ + │ + ┌──────┴──────┐ + │ Databricks │ + └─────────────┘ ``` This approach avoids the need for point-to-point converters between every pair of vendors. With N vendors, a point-to-point strategy would require N*(N-1) converters. 
With OSI as the hub, only 2*N converters are needed (one import and one export per vendor), and interoperability with all other vendors comes for free. From 333bfb462d31f58d8b3d6538b923edf92b4e4851 Mon Sep 17 00:00:00 2001 From: Baruch Oxman Date: Tue, 2 Jun 2026 10:13:26 -0700 Subject: [PATCH 13/13] Revert HONEYDEW entry from converters/index.md supported vendors table Co-Authored-By: Claude Sonnet 4.6 --- converters/index.md | 1 - 1 file changed, 1 deletion(-) diff --git a/converters/index.md b/converters/index.md index 462d290..6477291 100644 --- a/converters/index.md +++ b/converters/index.md @@ -54,7 +54,6 @@ The OSI specification currently defines extensions for the following vendors: | `SALESFORCE` | Salesforce / Tableau semantic layer | | `DBT` | dbt semantic models | | `DATABRICKS` | Databricks semantic layer | -| `HONEYDEW` | Honeydew workspace | Each vendor may define custom extensions (via the `custom_extensions` field in the OSI spec) to carry vendor-specific metadata that does not have an equivalent in the core specification.