diff --git a/converters/honeydew/.gitignore b/converters/honeydew/.gitignore new file mode 100644 index 0000000..b5fed1f --- /dev/null +++ b/converters/honeydew/.gitignore @@ -0,0 +1,8 @@ +__pycache__/ +*.py[cod] +.pytest_cache/ +*.egg-info/ +dist/ +build/ +.venv/ +venv/ diff --git a/converters/honeydew/README.md b/converters/honeydew/README.md new file mode 100644 index 0000000..ad3158a --- /dev/null +++ b/converters/honeydew/README.md @@ -0,0 +1,75 @@ +# OSI ↔ Honeydew Converter + +Bidirectional converter between [OSI](../../core-spec/spec.md) semantic models and [Honeydew](https://honeydew.ai/docs) workspace YAML. + +## Overview + +| Direction | Input | Output | +|-----------|-------|--------| +| `osi-to-honeydew` | Single OSI YAML file | Honeydew workspace directory | +| `honeydew-to-osi` | Honeydew workspace directory | Single OSI YAML file | + +### OSI → Honeydew mapping + +| OSI concept | Honeydew concept | +|-------------|-----------------| +| `semantic_model.name` | `workspace.yml name` | +| `dataset` | Entity + dataset files under `schema//` | +| `dataset.source` | `dataset.sql` | +| `dataset.primary_key` | `entity.keys` | +| Simple column field | Source Attribute (`dataset.attributes` entry) | +| Computed field expression | Calculated Attribute (`calculated_attribute` YAML) | +| `field.ai_context` | AI Metadata on the attribute or entity | +| `relationship` (from → to) | `entity.relations` on the "from" entity (`rel_type: many-to-one`) | +| `metric` | `metric` YAML (assigned to entity by expression parse) | + +### Honeydew → OSI mapping + +| Honeydew concept | OSI concept | +|-----------------|-------------| +| `workspace.name` | `semantic_model.name` | +| Entity + primary dataset | `dataset` | +| `entity.keys` | `dataset.primary_key` | +| `dataset.attributes` (columns) | `fields` with `ANSI_SQL` expression = column name | +| `calculated_attribute` SQL | `fields` with `ANSI_SQL` expression + `HONEYDEW` custom extension | +| `entity.relations` (`many-to-one`) | `relationship` with `from` = this entity | +| `entity.relations` (`one-to-many`) | `relationship` with `from` = target entity | +| `metric.sql` | `metric` expression in `ANSI_SQL` dialect | + +## Requirements + +- Python 3.12+ +- PyYAML 6.0+ + +## Setup + +```bash +pip install . +# or in editable mode for development: +pip install -e . +``` + +## Usage + +```bash +# OSI YAML → Honeydew workspace directory +python src/honeydew_osi_converter.py osi-to-honeydew -i input.yaml -o output_dir/ + +# Honeydew workspace directory → OSI YAML +python src/honeydew_osi_converter.py honeydew-to-osi -i workspace_dir/ -o output.yaml +``` + +## Tests + +```bash +python -m pytest tests/ +``` + +## Limitations + +- **One source dataset per entity**: Honeydew entities can have multiple source dataset files; the converter always generates exactly one, because an OSI `dataset` block describes a single table or SQL query. +- **Datatype inference**: OSI fields have no explicit datatype; the converter infers Honeydew datatypes from the `dimension.is_time` flag (`timestamp`) and the presence/absence of the `dimension` key (`string` vs `number`). +- **Honeydew SQL expressions**: Calculated attributes and metrics use Honeydew's `entity.attribute` reference syntax. These are exported as `ANSI_SQL` dialect expressions in OSI; they remain valid for round-tripping but may not run on other databases without adaptation. +- **Perspectives and domains**: Not converted (no OSI equivalent). +- **Connection expressions** (`connection_expr`): Preserved in `HONEYDEW` custom extensions on the OSI relationship and restored on the return trip. +- **`ai_context`**: OSI `ai_context` fields (synonyms, instructions) are stored in Honeydew `metadata` for round-trip recovery. Instructions are also merged into `description` for human readability. diff --git a/converters/honeydew/pyproject.toml b/converters/honeydew/pyproject.toml new file mode 100644 index 0000000..82df220 --- /dev/null +++ b/converters/honeydew/pyproject.toml @@ -0,0 +1,25 @@ +[project] +name = "honeydew-osi" +version = "0.2.0.dev0" +description = "Bidirectional converter between Honeydew workspace YAML and OSI semantic model" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "pyyaml>=6.0", +] + +[dependency-groups] +dev = [ + "pytest>=8.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build] +include = ["src/**/*"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +pythonpath = ["src"] diff --git a/converters/honeydew/src/honeydew_osi_converter.py b/converters/honeydew/src/honeydew_osi_converter.py new file mode 100644 index 0000000..a3c92f1 --- /dev/null +++ b/converters/honeydew/src/honeydew_osi_converter.py @@ -0,0 +1,1037 @@ +""" +Bidirectional converter between OSI and Honeydew semantic model formats. + +OSI → Honeydew: Converts a single OSI YAML file into a Honeydew workspace + directory (multiple YAML files per entity). + +Honeydew → OSI: Reads a Honeydew workspace directory and produces an OSI YAML. + +Usage: + python honeydew_osi_converter.py osi-to-honeydew -i input.yaml -o output_dir/ + python honeydew_osi_converter.py honeydew-to-osi -i workspace_dir/ -o output.yaml +""" + +import argparse +import json +import os +import re +import sys +import warnings +from typing import Any + +import yaml + +SUPPORTED_OSI_VERSION = "0.2.0.dev0" +HONEYDEW_VENDOR = "HONEYDEW" +_OSI_METADATA_SECTION = "osi" +_HD_ATTR_KEYS = ("display_name", "hidden", "folder", "format_string", "timegrain") + + +class HoneydewConversionError(Exception): + """Raised when conversion between OSI and Honeydew fails.""" + + +# ───────────────────────────────────────────────────────────────────────────── +# OSI → Honeydew +# ───────────────────────────────────────────────────────────────────────────── + + +def convert_osi_to_honeydew(osi_yaml_str: str) -> dict[str, str]: + """Convert an OSI YAML string to a Honeydew workspace file tree. + + Returns a dict mapping relative file paths to their YAML content strings. + The caller writes these to disk under the desired output directory. + + Honeydew workspace structure produced:: + + workspace.yml + schema// + .yml + datasets/.yml + attributes/.yml (computed fields only) + metrics/.yml + + ``ai_context`` is mapped to Honeydew's native fields (``description``, + ``labels``, and the AI metadata section); the structured form is also + stored in ``metadata`` for lossless round-tripping. ``unique_keys`` and + non-Honeydew ``custom_extensions`` have no direct Honeydew equivalent and + are stored in the Honeydew ``metadata`` section under a section named + ``"osi"`` so they can be recovered on the return trip. + + Args: + osi_yaml_str: OSI YAML document as a string. + + Returns: + Dict of {relative_path: yaml_content}. + + Raises: + HoneydewConversionError: On invalid or unsupported input. + """ + root = yaml.safe_load(osi_yaml_str) + if not isinstance(root, dict): + raise HoneydewConversionError("Invalid OSI YAML: expected a mapping at the root") + + version_str = str(root.get("version", "")) + if version_str != SUPPORTED_OSI_VERSION: + raise HoneydewConversionError( + f"Unsupported OSI version '{version_str}'. Supported: {SUPPORTED_OSI_VERSION}" + ) + + semantic_models = root.get("semantic_model") + if not isinstance(semantic_models, list) or not semantic_models: + raise HoneydewConversionError("'semantic_model' must be a non-empty list") + + if len(semantic_models) > 1: + warnings.warn( + f"OSI YAML contains {len(semantic_models)} semantic models; " + "only the first will be converted" + ) + + vendors = [v for v in (root.get("vendors") or []) if v != HONEYDEW_VENDOR] + return _model_to_files(semantic_models[0], extra_vendors=vendors) + + +def _model_to_files(sm: dict[str, Any], *, extra_vendors: list[str] | None = None) -> dict[str, str]: + name = sm.get("name") + if not name: + raise HoneydewConversionError("Missing 'name' in semantic model") + + files: dict[str, str] = {} + + workspace: dict[str, Any] = {"type": "workspace", "name": name} + if sm.get("description"): + workspace["description"] = sm["description"] + + # Preserve model-level ai_context, non-HONEYDEW custom_extensions, and extra vendors + model_ai_ctx = sm.get("ai_context") + model_ext = [e for e in (sm.get("custom_extensions") or []) if e.get("vendor_name") != HONEYDEW_VENDOR] + ws_meta = _build_osi_metadata( + ai_context=model_ai_ctx, + custom_extensions=model_ext or None, + extra_vendors=extra_vendors or None, + ) + if ws_meta: + workspace["metadata"] = [ws_meta] + + files["workspace.yml"] = _dump(workspace) + + datasets = sm.get("datasets") or [] + metrics = sm.get("metrics") or [] + relationships = sm.get("relationships") or [] + + entity_names = [ds["name"] for ds in datasets if ds.get("name")] + + # Group OSI relationships by from-entity + rel_by_entity: dict[str, list[dict[str, Any]]] = {} + for rel in relationships: + from_ds = rel.get("from") + if from_ds: + rel_by_entity.setdefault(from_ds, []).append(rel) + + # Assign OSI metrics to entities (honours HONEYDEW entity hint for round-trips) + metric_by_entity = _assign_metrics_to_entities(metrics, entity_names) + + for ds in datasets: + entity_name = ds.get("name") + if not entity_name: + raise HoneydewConversionError("Dataset missing 'name'") + files.update( + _dataset_to_files( + ds, + rel_by_entity.get(entity_name, []), + metric_by_entity.get(entity_name, []), + ) + ) + + return files + + +def _fields_to_honeydew( + fields: list[dict[str, Any]], + entity_name: str, +) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Classify OSI fields into Honeydew dataset attributes and calculated attributes.""" + dataset_attrs: list[dict[str, Any]] = [] + calc_attrs: list[dict[str, Any]] = [] + + for field in fields: + field_name = field.get("name") + if not field_name: + raise HoneydewConversionError(f"Field missing 'name' in dataset '{entity_name}'") + + expr = _pick_ansi_expression(field.get("expression"), field_name) + if not expr or not expr.strip(): + continue + + datatype = _osi_field_to_honeydew_datatype(field) + field_desc = field.get("description") + field_label = field.get("label") + field_ai_ctx = field.get("ai_context") + field_ext = [e for e in (field.get("custom_extensions") or []) if e.get("vendor_name") != HONEYDEW_VENDOR] + + # Merge ai_context instructions into description; keep full object in metadata + effective_desc = field_desc + if isinstance(field_ai_ctx, str) and field_ai_ctx: + effective_desc = f"{field_desc}\n{field_ai_ctx}" if field_desc else field_ai_ctx + elif isinstance(field_ai_ctx, dict) and field_ai_ctx.get("instructions"): + instr = field_ai_ctx["instructions"] + effective_desc = f"{field_desc}\n{instr}" if field_desc else instr + + # Build labels: OSI label + ai_context synonyms + labels: list[str] = [] + if field_label: + labels.append(field_label) + if isinstance(field_ai_ctx, dict): + for syn in (field_ai_ctx.get("synonyms") or []): + if syn not in labels: + labels.append(syn) + + field_meta = _build_osi_metadata( + ai_context=field_ai_ctx if isinstance(field_ai_ctx, dict) else None, + label=field_label if field_label and not isinstance(field_ai_ctx, dict) else None, + custom_extensions=field_ext or None, + ) + + hd_hint = _get_honeydew_extension(field) + force_calc = hd_hint.get("type") == "calculated_attribute" + + if _is_simple_identifier(expr) and not force_calc: + attr: dict[str, Any] = {"column": expr, "name": field_name, "datatype": datatype} + if effective_desc: + attr["description"] = effective_desc + if labels: + attr["labels"] = labels + if field_meta: + attr["metadata"] = [field_meta] + for k in _HD_ATTR_KEYS: + if k in hd_hint: + attr[k] = hd_hint[k] + dataset_attrs.append(attr) + else: + calc: dict[str, Any] = { + "type": "calculated_attribute", + "entity": entity_name, + "name": field_name, + "datatype": datatype, + "sql": expr, + } + if effective_desc: + calc["description"] = effective_desc + if labels: + calc["labels"] = labels + if field_meta: + calc["metadata"] = [field_meta] + for k in _HD_ATTR_KEYS: + if k in hd_hint: + calc[k] = hd_hint[k] + calc_attrs.append(calc) + + return dataset_attrs, calc_attrs + + +def _dataset_to_files( + ds: dict[str, Any], + relations: list[dict[str, Any]], + metrics: list[dict[str, Any]], +) -> dict[str, str]: + entity_name = ds["name"] + base = f"schema/{entity_name}" + files: dict[str, str] = {} + + primary_key = ds.get("primary_key") or [] + unique_keys = ds.get("unique_keys") + description = ds.get("description") + ai_context = ds.get("ai_context") + fields = ds.get("fields") or [] + ds_ext = [e for e in (ds.get("custom_extensions") or []) if e.get("vendor_name") != HONEYDEW_VENDOR] + + # ── entity YAML ──────────────────────────────────────────────────────────── + entity_dict: dict[str, Any] = {"type": "entity", "name": entity_name} + if description: + entity_dict["description"] = description + if primary_key: + entity_dict["keys"] = list(primary_key) + entity_dict["key_dataset"] = entity_name + + # Restore Honeydew-specific entity fields from HONEYDEW custom_extension + entity_hd_hint = _get_honeydew_extension(ds) + for key in ("owner", "display_name", "hidden", "folder"): + if key in entity_hd_hint: + entity_dict[key] = entity_hd_hint[key] + if "labels" in entity_hd_hint: + entity_dict["labels"] = entity_hd_hint["labels"] + + honeydew_relations = [] + for rel in relations: + hr = _osi_relation_to_honeydew(rel) + if hr is not None: + honeydew_relations.append(hr) + entity_dict["relations"] = honeydew_relations + + # Preserve OSI fields that have no Honeydew native equivalent + entity_meta = _build_osi_metadata( + ai_context=ai_context, + unique_keys=unique_keys, + custom_extensions=ds_ext or None, + ) + if entity_meta: + entity_dict["metadata"] = [entity_meta] + + files[f"{base}/{entity_name}.yml"] = _dump(entity_dict) + + # ── classify fields into dataset attributes vs calculated attributes ──────── + dataset_attrs, calc_attrs = _fields_to_honeydew(fields, entity_name) + + # ── dataset YAML ─────────────────────────────────────────────────────────── + source_sql, dataset_type = _parse_osi_source(ds.get("source", "")) + dataset_dict: dict[str, Any] = { + "type": "dataset", + "entity": entity_name, + "name": entity_name, + "sql": source_sql, + "dataset_type": dataset_type, + "attributes": dataset_attrs, + } + if description: + dataset_dict["description"] = description + + files[f"{base}/datasets/{entity_name}.yml"] = _dump(dataset_dict) + + # ── calculated_attribute YAMLs ───────────────────────────────────────────── + for calc in calc_attrs: + files[f"{base}/attributes/{calc['name']}.yml"] = _dump(calc) + + # ── metric YAMLs ──────────────────────────────────────────────────────────── + for metric in metrics: + mname = metric.get("name") + if not mname: + continue + mexpr = _pick_ansi_expression(metric.get("expression"), mname) + if not mexpr or not mexpr.strip(): + continue + + metric_dict: dict[str, Any] = { + "type": "metric", + "entity": entity_name, + "name": mname, + "datatype": "number", + "sql": mexpr, + } + if metric.get("description"): + metric_dict["description"] = metric["description"] + + metric_ai_ctx = metric.get("ai_context") + if isinstance(metric_ai_ctx, str) and metric_ai_ctx: + existing = metric_dict.get("description", "") + metric_dict["description"] = f"{existing}\n{metric_ai_ctx}".strip() if existing else metric_ai_ctx + + metric_ext = [e for e in (metric.get("custom_extensions") or []) if e.get("vendor_name") != HONEYDEW_VENDOR] + metric_meta = _build_osi_metadata( + ai_context=metric_ai_ctx, + custom_extensions=metric_ext or None, + ) + if metric_meta: + metric_dict["metadata"] = [metric_meta] + + metric_path = f"{base}/metrics/{mname}.yml" + if metric_path in files: + warnings.warn( + f"Metric '{mname}' in entity '{entity_name}' is defined more than once; later definition wins" + ) + files[metric_path] = _dump(metric_dict) + + return files + + +def _osi_relation_to_honeydew(rel: dict[str, Any]) -> dict[str, Any] | None: + rel_name = rel.get("name", "") + to_ds = rel.get("to") + if not to_ds: + warnings.warn(f"Relationship '{rel_name}' missing 'to', skipping") + return None + + from_cols = rel.get("from_columns") or [] + to_cols = rel.get("to_columns") or [] + + if len(from_cols) != len(to_cols): + raise HoneydewConversionError( + f"Relationship '{rel_name}': from_columns and to_columns length mismatch " + f"({len(from_cols)} vs {len(to_cols)})" + ) + + honeydew_rel: dict[str, Any] = { + "target_entity": to_ds, + "rel_type": "many-to-one", + } + if rel.get("name"): + honeydew_rel["name"] = rel["name"] + if from_cols: + honeydew_rel["connection"] = [ + {"src_field": fc, "target_field": tc} + for fc, tc in zip(from_cols, to_cols) + ] + else: + hd_ext = _get_honeydew_extension(rel) + if hd_ext.get("connection_expr"): + honeydew_rel["connection_expr"] = {"sql": hd_ext["connection_expr"]} + else: + warnings.warn( + f"Relationship '{rel_name}' has no from_columns and no connection_expr; " + "Honeydew will not be able to resolve the join" + ) + return honeydew_rel + + +def _pick_ansi_expression(expression: Any, field_name: str) -> str | None: + """Select the ANSI_SQL expression; fall back to first available dialect.""" + if expression is None: + return None + if not isinstance(expression, dict): + warnings.warn(f"'{field_name}': 'expression' must be a mapping; field will be skipped") + return None + dialects = expression.get("dialects") or [] + if not dialects: + return None + + ansi_expr = None + first_expr = None + + for d in dialects: + dialect = (d.get("dialect") or "").upper() + expr = d.get("expression") + if first_expr is None: + first_expr = expr + if dialect == "ANSI_SQL" and ansi_expr is None: + ansi_expr = expr + + if ansi_expr is not None: + return ansi_expr + + if first_expr is not None: + warnings.warn(f"'{field_name}': no ANSI_SQL dialect found; using first available") + return first_expr + + return None + + +def _osi_field_to_honeydew_datatype(field: dict[str, Any]) -> str: + hd_ext = _get_honeydew_extension(field) + if hd_ext.get("datatype"): + return hd_ext["datatype"] + dimension = field.get("dimension") + if isinstance(dimension, dict) and dimension.get("is_time"): + return "timestamp" + if dimension is not None: + return "string" + return "number" + + +def _is_simple_identifier(expr: str) -> bool: + return bool(re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", expr.strip())) + + +def _parse_osi_source(source: str) -> tuple[str, str]: + source = (source or "").strip() + if not source: + return ("", "table") + upper = source.upper() + if upper.startswith(("SELECT ", "SELECT\n", "SELECT\t", "WITH ", "WITH\n", "WITH\t")): + return (source, "sql") + return (source, "table") + + +def _assign_metrics_to_entities( + metrics: list[dict[str, Any]], + entity_names: list[str], +) -> dict[str, list[dict[str, Any]]]: + """Assign each OSI metric to the most appropriate Honeydew entity. + + Priority: + 1. HONEYDEW ``custom_extension`` entity hint (preserves round-trip placement) + 2. First ``entity.column`` pattern in the ANSI_SQL expression + 3. First entity in the model (with a warning) + """ + entity_set = set(entity_names) + result: dict[str, list[dict[str, Any]]] = {} + + for metric in metrics: + mname = metric.get("name", "") + + # Priority 1: HONEYDEW entity hint (set during Honeydew → OSI) + hinted = _get_honeydew_extension(metric).get("entity") + if hinted and hinted in entity_set: + result.setdefault(hinted, []).append(metric) + continue + + # Priority 2: expression scan + expr_dict = metric.get("expression") or {} + dialects = expr_dict.get("dialects") or [] if isinstance(expr_dict, dict) else [] + expr_str = "" + for d in dialects: + if (d.get("dialect") or "").upper() == "ANSI_SQL": + expr_str = d.get("expression") or "" + break + if not expr_str and dialects: + expr_str = dialects[0].get("expression") or "" + + assigned = _find_entity_in_expression(expr_str, entity_set) + + # Priority 3: fallback + if assigned is None: + if entity_names: + assigned = entity_names[0] + warnings.warn( + f"Metric '{mname}': no entity reference found in expression; " + f"assigning to '{assigned}'" + ) + else: + warnings.warn(f"Metric '{mname}': no entities to assign to, skipping") + continue + + result.setdefault(assigned, []).append(metric) + + return result + + +def _find_entity_in_expression(expr: str, entity_names: set[str]) -> str | None: + for match in re.finditer(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\.([a-zA-Z_][a-zA-Z0-9_]*)\b", expr): + if match.group(1) in entity_names: + return match.group(1) + return None + + +# ───────────────────────────────────────────────────────────────────────────── +# Honeydew → OSI +# ───────────────────────────────────────────────────────────────────────────── + + +def convert_honeydew_to_osi(workspace_dir: str) -> str: + """Convert a Honeydew workspace directory to an OSI YAML string. + + Reads workspace.yml and all entity subdirectories under schema/. Honeydew + fields with no OSI equivalent (``owner``, ``display_name``, ``hidden``, + ``format_string``, ``timegrain``, attribute ``labels``) are preserved in a + HONEYDEW ``custom_extension`` so they survive a round-trip back to Honeydew. + + Args: + workspace_dir: Path to the Honeydew workspace root. + + Returns: + OSI YAML document string. + + Raises: + HoneydewConversionError: On missing workspace.yml. + """ + workspace_path = os.path.join(workspace_dir, "workspace.yml") + if not os.path.exists(workspace_path): + raise HoneydewConversionError(f"workspace.yml not found in '{workspace_dir}'") + + with open(workspace_path) as f: + workspace = yaml.safe_load(f) or {} + + model_name = workspace.get("name") or os.path.basename(workspace_dir.rstrip("/\\")) + model_description = workspace.get("description") + ws_osi_meta = _read_osi_metadata(workspace) + + schema_dir = os.path.join(workspace_dir, "schema") + entity_dirs: list[str] = [] + if os.path.isdir(schema_dir): + entity_dirs = sorted( + d for d in os.listdir(schema_dir) + if os.path.isdir(os.path.join(schema_dir, d)) + ) + + osi_datasets: list[dict[str, Any]] = [] + osi_relationships: list[dict[str, Any]] = [] + osi_metrics: list[dict[str, Any]] = [] + seen_relationships: set[tuple] = set() + + for entity_name in entity_dirs: + entity_dir = os.path.join(schema_dir, entity_name) + entity_data = _read_entity_dir(entity_dir, entity_name) + + osi_datasets.append(_entity_to_osi_dataset(entity_data)) + + for rel in entity_data["relations"]: + osi_rel = _honeydew_relation_to_osi( + rel, entity_name, seen_relationships + ) + if osi_rel is not None: + osi_relationships.append(osi_rel) + + for metric in entity_data["metrics"]: + osi_m = _honeydew_metric_to_osi(metric, entity_name) + if osi_m is not None: + osi_metrics.append(osi_m) + + sm: dict[str, Any] = {"name": model_name, "datasets": osi_datasets} + if model_description: + sm["description"] = str(model_description).strip() + if ws_osi_meta.get("ai_context"): + sm["ai_context"] = ws_osi_meta["ai_context"] + + restored_ws_ext = ws_osi_meta.get("custom_extensions") or [] + if restored_ws_ext: + sm["custom_extensions"] = restored_ws_ext + + if osi_relationships: + sm["relationships"] = osi_relationships + if osi_metrics: + sm["metrics"] = osi_metrics + + extra_vendors = ws_osi_meta.get("vendors") or [] + vendors = [HONEYDEW_VENDOR] + [v for v in extra_vendors if v != HONEYDEW_VENDOR] + root: dict[str, Any] = { + "version": SUPPORTED_OSI_VERSION, + "vendors": vendors, + "semantic_model": [sm], + } + return _dump(root) + + +def _read_entity_dir(entity_dir: str, entity_name: str) -> dict[str, Any]: + data: dict[str, Any] = { + "name": entity_name, + "description": None, + "keys": [], + "key_dataset": None, + "relations": [], + "primary_dataset": None, + "calculated_attributes": [], + "metrics": [], + "osi_meta": {}, + "honeydew_extra": {}, + } + + entity_yml = os.path.join(entity_dir, f"{entity_name}.yml") + if os.path.exists(entity_yml): + with open(entity_yml) as f: + ey = yaml.safe_load(f) or {} + data["keys"] = _coerce_list(ey.get("keys")) + data["description"] = ey.get("description") + data["key_dataset"] = ey.get("key_dataset") + data["relations"] = ey.get("relations") or [] + data["osi_meta"] = _read_osi_metadata(ey) + data["honeydew_extra"] = { + k: ey[k] for k in ("owner", "display_name", "hidden", "folder", "labels") + if k in ey + } + + datasets_dir = os.path.join(entity_dir, "datasets") + if os.path.isdir(datasets_dir): + all_ds: list[dict[str, Any]] = [] + for fn in sorted(os.listdir(datasets_dir)): + if fn.endswith((".yml", ".yaml")): + with open(os.path.join(datasets_dir, fn)) as f: + all_ds.append(yaml.safe_load(f) or {}) + if len(all_ds) > 1: + warnings.warn( + f"Entity '{entity_name}' has {len(all_ds)} dataset files; " + "only the primary dataset will be converted", + stacklevel=2, + ) + for ds in all_ds: + if ds.get("name") == data["key_dataset"] or data["primary_dataset"] is None: + data["primary_dataset"] = ds + if ds.get("name") == data["key_dataset"]: + break + + attrs_dir = os.path.join(entity_dir, "attributes") + if os.path.isdir(attrs_dir): + for fn in sorted(os.listdir(attrs_dir)): + if fn.endswith((".yml", ".yaml")): + with open(os.path.join(attrs_dir, fn)) as f: + data["calculated_attributes"].append(yaml.safe_load(f) or {}) + + metrics_dir = os.path.join(entity_dir, "metrics") + if os.path.isdir(metrics_dir): + for fn in sorted(os.listdir(metrics_dir)): + if fn.endswith((".yml", ".yaml")): + with open(os.path.join(metrics_dir, fn)) as f: + data["metrics"].append(yaml.safe_load(f) or {}) + + return data + + +def _entity_to_osi_dataset(entity_data: dict[str, Any]) -> dict[str, Any]: + entity_name = entity_data["name"] + ds: dict[str, Any] = {"name": entity_name} + + if entity_data.get("description"): + ds["description"] = str(entity_data["description"]).strip() + + primary_ds = entity_data.get("primary_dataset") + ds["source"] = (primary_ds.get("sql") or "").strip() if primary_ds else entity_name + + keys = entity_data.get("keys") or [] + if keys: + ds["primary_key"] = list(keys) + + # Restore OSI-only fields preserved in Honeydew metadata + osi_meta = entity_data.get("osi_meta") or {} + if osi_meta.get("ai_context"): + ds["ai_context"] = osi_meta["ai_context"] + if osi_meta.get("unique_keys"): + ds["unique_keys"] = osi_meta["unique_keys"] + + restored_ext = list(osi_meta.get("custom_extensions") or []) + honeydew_extra = entity_data.get("honeydew_extra") or {} + if honeydew_extra: + restored_ext.append({"vendor_name": HONEYDEW_VENDOR, "data": json.dumps(honeydew_extra)}) + if restored_ext: + ds["custom_extensions"] = restored_ext + + # Build fields + fields: list[dict[str, Any]] = [] + seen: set[str] = set() + + if primary_ds: + for attr in primary_ds.get("attributes") or []: + col = attr.get("column") or attr.get("name") or "" + aname = attr.get("name") or col + if not aname or aname in seen: + continue + seen.add(aname) + + field: dict[str, Any] = { + "name": aname, + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": col}]}, + } + datatype = attr.get("datatype") or "string" + dim = _honeydew_datatype_to_osi_dimension(datatype) + if dim is not None: + field["dimension"] = dim + if attr.get("description"): + field["description"] = str(attr["description"]).strip() + + attr_osi_meta = _read_osi_metadata(attr) + attr_labels = attr.get("labels") or [] + + # Restore ai_context (structured form takes priority) + # Only synthesise synonyms from labels when they are native Honeydew labels + # (i.e. osi_meta has no 'label' key, meaning the label didn't come from OSI) + if attr_osi_meta.get("ai_context"): + field["ai_context"] = attr_osi_meta["ai_context"] + elif attr_labels and "label" not in attr_osi_meta: + field["ai_context"] = {"synonyms": list(attr_labels)} + + # Restore label: prefer osi_meta (exact round-trip), else first Honeydew label + # Don't set label when labels came from ai_context.synonyms (osi_meta has ai_context) + if "label" in attr_osi_meta: + field["label"] = attr_osi_meta["label"] + elif attr_labels and not attr_osi_meta.get("ai_context"): + field["label"] = attr_labels[0] + + # Honeydew-specific metadata → HONEYDEW custom_extension + attr_honeydew_extra = { + k: attr[k] for k in ("display_name", "hidden", "folder", "format_string", "timegrain") + if k in attr + } + if datatype == "bool": + attr_honeydew_extra["datatype"] = datatype + if len(attr_labels) > 1: + attr_honeydew_extra["labels"] = attr_labels + + all_ext = list(attr_osi_meta.get("custom_extensions") or []) + if attr_honeydew_extra: + all_ext.append({"vendor_name": HONEYDEW_VENDOR, "data": json.dumps(attr_honeydew_extra)}) + if all_ext: + field["custom_extensions"] = all_ext + + fields.append(field) + + for calc in entity_data.get("calculated_attributes") or []: + aname = calc.get("name") or "" + if not aname or aname in seen: + continue + seen.add(aname) + + sql = (calc.get("sql") or "").strip() + datatype = calc.get("datatype") or "string" + + field = { + "name": aname, + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": sql}]}, + } + dim = _honeydew_datatype_to_osi_dimension(datatype) + if dim is not None: + field["dimension"] = dim + if calc.get("description"): + cleaned = str(calc["description"]).strip() + if cleaned: + field["description"] = cleaned + + calc_osi_meta = _read_osi_metadata(calc) + calc_labels = calc.get("labels") or [] + + if calc_osi_meta.get("ai_context"): + field["ai_context"] = calc_osi_meta["ai_context"] + elif calc_labels and "label" not in calc_osi_meta: + field["ai_context"] = {"synonyms": list(calc_labels)} + + if "label" in calc_osi_meta: + field["label"] = calc_osi_meta["label"] + elif calc_labels and not calc_osi_meta.get("ai_context"): + field["label"] = calc_labels[0] + + calc_honeydew_extra = { + k: calc[k] for k in ("display_name", "hidden", "folder", "format_string", "timegrain") + if k in calc + } + if datatype == "bool": + calc_honeydew_extra["datatype"] = datatype + + all_calc_ext = list(calc_osi_meta.get("custom_extensions") or []) + # Always mark as calculated_attribute so OSI → Honeydew routes it correctly + all_calc_ext.append({ + "vendor_name": HONEYDEW_VENDOR, + "data": json.dumps(dict({"type": "calculated_attribute", "entity": entity_name}, **calc_honeydew_extra)), + }) + field["custom_extensions"] = all_calc_ext + + fields.append(field) + + if fields: + ds["fields"] = fields + + return ds + + +def _honeydew_datatype_to_osi_dimension(datatype: str) -> dict[str, Any] | None: + dt = (datatype or "").lower() + if dt in ("date", "timestamp", "time"): + return {"is_time": True} + if dt in ("bool", "string"): + return {"is_time": False} + return None # number / float → OSI fact (no dimension key) + + +def _honeydew_relation_to_osi( + rel: dict[str, Any], + entity_name: str, + seen: set[tuple], +) -> dict[str, Any] | None: + target = rel.get("target_entity") + if not target: + warnings.warn(f"Entity '{entity_name}': relation missing target_entity, skipping") + return None + + rel_type = (rel.get("rel_type") or "many-to-one").lower() + connection = rel.get("connection") or [] + connection_expr = rel.get("connection_expr") + + if rel_type == "many-to-one": + from_entity, to_entity = entity_name, target + from_cols = [c.get("src_field", "") for c in connection] + to_cols = [c.get("target_field", "") for c in connection] + elif rel_type == "one-to-many": + from_entity, to_entity = target, entity_name + from_cols = [c.get("target_field", "") for c in connection] + to_cols = [c.get("src_field", "") for c in connection] + else: + from_entity, to_entity = entity_name, target + from_cols = [c.get("src_field", "") for c in connection] + to_cols = [c.get("target_field", "") for c in connection] + + dedup_key = (from_entity, to_entity, tuple(from_cols), tuple(to_cols)) + if dedup_key in seen: + return None + seen.add(dedup_key) + + rel_name = rel.get("name") or f"{from_entity}_to_{to_entity}" + + osi_rel: dict[str, Any] = {"name": rel_name, "from": from_entity, "to": to_entity} + if from_cols: + osi_rel["from_columns"] = from_cols + osi_rel["to_columns"] = to_cols + + if connection_expr and not connection: + sql_expr = (connection_expr.get("sql") or "") if isinstance(connection_expr, dict) else str(connection_expr) + osi_rel["custom_extensions"] = [ + {"vendor_name": HONEYDEW_VENDOR, "data": json.dumps({"connection_expr": sql_expr})} + ] + + return osi_rel + + +def _honeydew_metric_to_osi(metric: dict[str, Any], entity_name: str) -> dict[str, Any] | None: + mname = metric.get("name") or "" + if not mname: + return None + + sql = (metric.get("sql") or "").strip() + if not sql: + warnings.warn(f"Metric '{mname}' in entity '{entity_name}' has no SQL, skipping") + return None + + osi_m: dict[str, Any] = { + "name": mname, + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": sql}]}, + "custom_extensions": [ + {"vendor_name": HONEYDEW_VENDOR, "data": json.dumps({"entity": entity_name})} + ], + } + + if metric.get("description"): + cleaned = str(metric["description"]).strip() + if cleaned: + osi_m["description"] = cleaned + + metric_osi_meta = _read_osi_metadata(metric) + if metric_osi_meta.get("ai_context"): + osi_m["ai_context"] = metric_osi_meta["ai_context"] + + restored_ext = metric_osi_meta.get("custom_extensions") or [] + if restored_ext: + osi_m["custom_extensions"] = osi_m["custom_extensions"] + list(restored_ext) + + return osi_m + + +# ───────────────────────────────────────────────────────────────────────────── +# OSI metadata helpers — store/restore OSI fields in Honeydew metadata sections +# ───────────────────────────────────────────────────────────────────────────── + + +def _build_osi_metadata( + *, + ai_context: Any = None, + label: str | None = None, + unique_keys: Any = None, + custom_extensions: list | None = None, + extra_vendors: list[str] | None = None, +) -> dict[str, Any] | None: + """Build a Honeydew metadata entry that stores OSI-only fields for round-tripping.""" + items: list[dict[str, Any]] = [] + + if ai_context is not None: + val = ai_context if isinstance(ai_context, str) else json.dumps(ai_context) + items.append({"name": "ai_context", "value": val}) + if label is not None: + items.append({"name": "label", "value": label}) + if unique_keys: + items.append({"name": "unique_keys", "value": json.dumps(unique_keys)}) + if custom_extensions: + items.append({"name": "custom_extensions", "value": json.dumps(custom_extensions)}) + if extra_vendors: + items.append({"name": "vendors", "value": json.dumps(extra_vendors)}) + + if not items: + return None + return {"name": _OSI_METADATA_SECTION, "metadata": items} + + +def _read_osi_metadata(obj: dict[str, Any]) -> dict[str, Any]: + """Read OSI-preserved fields from a Honeydew object's 'osi' metadata section.""" + for section in (obj.get("metadata") or []): + if (section.get("name") or "") != _OSI_METADATA_SECTION: + continue + result: dict[str, Any] = {} + for item in (section.get("metadata") or []): + key = item.get("name") or "" + raw = item.get("value") + if key == "ai_context": + try: + result[key] = json.loads(raw) + except (json.JSONDecodeError, TypeError): + result[key] = raw + elif key == "label": + result[key] = raw + elif key in ("unique_keys", "custom_extensions", "vendors"): + try: + result[key] = json.loads(raw) + except (json.JSONDecodeError, TypeError): + warnings.warn(f"Could not parse OSI metadata field '{key}': {raw!r}") + return result + return {} + + +def _get_honeydew_extension(obj: dict[str, Any]) -> dict[str, Any]: + """Extract the HONEYDEW custom_extension data from an OSI object.""" + for ext in (obj.get("custom_extensions") or []): + if ext.get("vendor_name") == HONEYDEW_VENDOR: + try: + return json.loads(ext.get("data") or "{}") + except (json.JSONDecodeError, TypeError): + return {} + return {} + + +# ───────────────────────────────────────────────────────────────────────────── +# Utilities +# ───────────────────────────────────────────────────────────────────────────── + + +def _coerce_list(value: Any) -> list: + if value is None: + return [] + if isinstance(value, list): + return value + return [value] + + +def _dump(data: Any) -> str: + return yaml.dump(data, default_flow_style=False, sort_keys=False, allow_unicode=True) + + +def _check_safe_path(output_abs: str, rel_path: str) -> bool: + """Return True iff the resolved path stays inside output_abs.""" + full = os.path.normpath(os.path.join(output_abs, rel_path)) + return full.startswith(output_abs + os.sep) + + +# ───────────────────────────────────────────────────────────────────────────── +# CLI +# ───────────────────────────────────────────────────────────────────────────── + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Bidirectional OSI ↔ Honeydew semantic model converter" + ) + sub = parser.add_subparsers(dest="command", required=True) + + p1 = sub.add_parser("osi-to-honeydew", help="Convert OSI YAML → Honeydew workspace") + p1.add_argument("-i", "--input", required=True, help="OSI YAML input file") + p1.add_argument("-o", "--output", required=True, help="Output directory for Honeydew workspace") + + p2 = sub.add_parser("honeydew-to-osi", help="Convert Honeydew workspace → OSI YAML") + p2.add_argument("-i", "--input", required=True, help="Honeydew workspace directory") + p2.add_argument("-o", "--output", required=True, help="OSI YAML output file") + + args = parser.parse_args() + + if args.command == "osi-to-honeydew": + with open(args.input) as f: + osi_yaml = f.read() + try: + files = convert_osi_to_honeydew(osi_yaml) + except HoneydewConversionError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + output_abs = os.path.abspath(args.output) + for rel_path, content in files.items(): + if not _check_safe_path(output_abs, rel_path): + print(f"Error: refusing to write outside output directory: {rel_path}", file=sys.stderr) + sys.exit(1) + full_path = os.path.normpath(os.path.join(output_abs, rel_path)) + os.makedirs(os.path.dirname(full_path), exist_ok=True) + with open(full_path, "w") as f: + f.write(content) + print(f"Wrote {len(files)} file(s) to {args.output}") + + elif args.command == "honeydew-to-osi": + try: + osi_yaml = convert_honeydew_to_osi(args.input) + except HoneydewConversionError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + with open(args.output, "w") as f: + f.write(osi_yaml) + print(f"Converted {args.input} → {args.output}") + + +if __name__ == "__main__": + main() diff --git a/converters/honeydew/tests/test_honeydew_osi_converter.py b/converters/honeydew/tests/test_honeydew_osi_converter.py new file mode 100644 index 0000000..d9a778e --- /dev/null +++ b/converters/honeydew/tests/test_honeydew_osi_converter.py @@ -0,0 +1,1444 @@ +"""Tests for the bidirectional OSI ↔ Honeydew converter.""" + +import json +import os +import sys +import warnings +from pathlib import Path + +import pytest +import yaml + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src")) +from honeydew_osi_converter import ( + HoneydewConversionError, + _assign_metrics_to_entities, + _build_osi_metadata, + _check_safe_path, + _fields_to_honeydew, + _find_entity_in_expression, + _honeydew_datatype_to_osi_dimension, + _is_simple_identifier, + _osi_field_to_honeydew_datatype, + _parse_osi_source, + _pick_ansi_expression, + _read_osi_metadata, + convert_honeydew_to_osi, + convert_osi_to_honeydew, +) + +# ───────────────────────────────────────────────────────────────────────────── +# Helpers +# ───────────────────────────────────────────────────────────────────────────── + +OSI_VERSION = "0.2.0.dev0" + + +def _osi(model_dict): + return yaml.dump( + {"version": OSI_VERSION, "semantic_model": [model_dict]}, + default_flow_style=False, + sort_keys=False, + ) + + +def _minimal_osi_field(name, expr, is_dimension=True, is_time=False): + field = { + "name": name, + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": expr}]}, + } + if is_dimension: + field["dimension"] = {"is_time": is_time} + return field + + +def _minimal_model(): + return { + "name": "test_model", + "datasets": [ + { + "name": "orders", + "source": "db.schema.orders", + "primary_key": ["order_id"], + "fields": [ + _minimal_osi_field("order_id", "order_id"), + _minimal_osi_field("order_date", "order_date", is_time=True), + _minimal_osi_field("total", "total_amount", is_dimension=False), + ], + } + ], + } + + +def _write_workspace(tmp_dir, workspace_name, entities): + """Write a minimal Honeydew workspace to tmp_dir.""" + workspace_path = os.path.join(tmp_dir, "workspace.yml") + with open(workspace_path, "w") as f: + yaml.dump({"type": "workspace", "name": workspace_name}, f) + + for e in entities: + ename = e["name"] + base = os.path.join(tmp_dir, "schema", ename) + os.makedirs(os.path.join(base, "datasets"), exist_ok=True) + os.makedirs(os.path.join(base, "attributes"), exist_ok=True) + os.makedirs(os.path.join(base, "metrics"), exist_ok=True) + + entity_dict = { + "type": "entity", + "name": ename, + "keys": e.get("keys", []), + "key_dataset": e.get("key_dataset", ename), + "relations": e.get("relations", []), + } + for k in ("owner", "display_name", "hidden", "folder"): + if k in e: + entity_dict[k] = e[k] + with open(os.path.join(base, f"{ename}.yml"), "w") as f: + yaml.dump(entity_dict, f) + + ds_name = e.get("key_dataset", ename) + ds_dict = { + "type": "dataset", + "entity": ename, + "name": ds_name, + "sql": e.get("sql", "DB.SCHEMA." + ename.upper()), + "dataset_type": "table", + "attributes": e.get("dataset_attrs", []), + } + with open(os.path.join(base, "datasets", f"{ds_name}.yml"), "w") as f: + yaml.dump(ds_dict, f) + + for attr in e.get("calc_attrs", []): + with open(os.path.join(base, "attributes", f"{attr['name']}.yml"), "w") as f: + yaml.dump(attr, f) + + for m in e.get("metrics", []): + with open(os.path.join(base, "metrics", f"{m['name']}.yml"), "w") as f: + yaml.dump(m, f) + + +def _osi_roundtrip(model_dict, tmp_path): + """OSI → Honeydew → OSI; returns the semantic model dict.""" + files = convert_osi_to_honeydew(_osi(model_dict)) + for rel_path, content in files.items(): + p = tmp_path / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + return yaml.safe_load(convert_honeydew_to_osi(str(tmp_path)))["semantic_model"][0] + + +def _honeydew_roundtrip(entities, tmp_path): + """Honeydew → OSI → Honeydew; returns Path to the output workspace directory.""" + _write_workspace(str(tmp_path), "ws", entities) + osi_yaml = convert_honeydew_to_osi(str(tmp_path)) + files = convert_osi_to_honeydew(osi_yaml) + out_dir = tmp_path / "out" + for rel_path, content in files.items(): + p = out_dir / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + return out_dir + + +# ───────────────────────────────────────────────────────────────────────────── +# Unit tests – helpers +# ───────────────────────────────────────────────────────────────────────────── + +@pytest.mark.parametrize("expr,expected", [ + ("order_id", True), + ("SUM(x)", False), + ("orders.id", False), + ("1col", False), + ("_hidden", True), +]) +def test_is_simple_identifier(expr, expected): + assert _is_simple_identifier(expr) is expected + + +@pytest.mark.parametrize("source,expected_sql,expected_type", [ + ("db.schema.table", "db.schema.table", "table"), + ("SELECT id FROM foo", "SELECT id FROM foo", "sql"), + ("WITH cte AS (SELECT 1) SELECT * FROM cte", "WITH cte AS (SELECT 1) SELECT * FROM cte", "sql"), + ("", "", "table"), +]) +def test_parse_osi_source(source, expected_sql, expected_type): + sql, dtype = _parse_osi_source(source) + assert sql == expected_sql and dtype == expected_type + + +@pytest.mark.parametrize("field,expected_dt", [ + ({"dimension": {"is_time": True}}, "timestamp"), + ({"dimension": {"is_time": False}}, "string"), + ({}, "number"), +]) +def test_osi_field_to_honeydew_datatype(field, expected_dt): + assert _osi_field_to_honeydew_datatype(field) == expected_dt + + +@pytest.mark.parametrize("datatype,expected_dim", [ + ("date", {"is_time": True}), + ("timestamp", {"is_time": True}), + ("string", {"is_time": False}), + ("bool", {"is_time": False}), + ("number", None), + ("float", None), +]) +def test_honeydew_datatype_to_osi_dimension(datatype, expected_dim): + assert _honeydew_datatype_to_osi_dimension(datatype) == expected_dim + + +@pytest.mark.parametrize("expr,entities,expected", [ + ("SUM(orders.total)", {"orders", "customers"}, "orders"), + ("orders.a / customers.b", {"orders", "customers"}, "orders"), + ("COUNT(*)", {"orders"}, None), + ("SUM(foo.col)", {"orders"}, None), +]) +def test_find_entity_in_expression(expr, entities, expected): + assert _find_entity_in_expression(expr, entities) == expected + + +def test_pick_ansi_expression_ansi_preferred(): + expr = {"dialects": [ + {"dialect": "SNOWFLAKE", "expression": "col::VARCHAR"}, + {"dialect": "ANSI_SQL", "expression": "col"}, + ]} + assert _pick_ansi_expression(expr, "f") == "col" + + +def test_pick_ansi_expression_fallback_warns(): + expr = {"dialects": [{"dialect": "SNOWFLAKE", "expression": "col::VARCHAR"}]} + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = _pick_ansi_expression(expr, "f") + assert result == "col::VARCHAR" + assert any("ANSI_SQL" in str(x.message) for x in w) + + +@pytest.mark.parametrize("expression", [None, {"dialects": []}]) +def test_pick_ansi_expression_returns_none(expression): + assert _pick_ansi_expression(expression, "f") is None + + +def test_pick_ansi_expression_non_dict_warns(): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = _pick_ansi_expression("just_a_string", "f") + assert result is None + assert any("must be a mapping" in str(x.message) for x in w) + + +# ───────────────────────────────────────────────────────────────────────────── +# OSI metadata helpers +# ───────────────────────────────────────────────────────────────────────────── + +def test_build_and_read_ai_context_string(): + section = _build_osi_metadata(ai_context="orders, purchases") + result = _read_osi_metadata({"metadata": [section]}) + assert result["ai_context"] == "orders, purchases" + + +def test_build_and_read_ai_context_dict(): + ctx = {"instructions": "Use for sales", "synonyms": ["orders", "purchases"]} + section = _build_osi_metadata(ai_context=ctx) + result = _read_osi_metadata({"metadata": [section]}) + assert result["ai_context"] == ctx + + +def test_build_and_read_unique_keys(): + uks = [["col1", "col2"], ["col3"]] + section = _build_osi_metadata(unique_keys=uks) + result = _read_osi_metadata({"metadata": [section]}) + assert result["unique_keys"] == uks + + +def test_build_and_read_custom_extensions(): + exts = [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}] + section = _build_osi_metadata(custom_extensions=exts) + result = _read_osi_metadata({"metadata": [section]}) + assert result["custom_extensions"] == exts + + +def test_read_osi_metadata_no_osi_section(): + assert _read_osi_metadata({"metadata": [{"name": "other", "metadata": []}]}) == {} + + +def test_read_osi_metadata_no_metadata(): + assert _read_osi_metadata({}) == {} + + +def test_build_osi_metadata_nothing_to_store(): + assert _build_osi_metadata() is None + + +# ───────────────────────────────────────────────────────────────────────────── +# Assign metrics to entities +# ───────────────────────────────────────────────────────────────────────────── + +def test_assign_metrics_by_expression(): + metrics = [{"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}] + result = _assign_metrics_to_entities(metrics, ["orders", "customers"]) + assert "total" in [m["name"] for m in result.get("orders", [])] + + +def test_assign_metrics_honeydew_hint_takes_priority(): + metrics = [{ + "name": "cnt", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.x)"}]}, + "custom_extensions": [{"vendor_name": "HONEYDEW", "data": '{"entity": "customers"}'}], + }] + result = _assign_metrics_to_entities(metrics, ["orders", "customers"]) + assert "cnt" in [m["name"] for m in result.get("customers", [])] + assert "orders" not in result + + +def test_assign_metrics_fallback_to_first_entity(): + metrics = [{"name": "cnt", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "COUNT(*)"}]}}] + with warnings.catch_warnings(record=True): + result = _assign_metrics_to_entities(metrics, ["orders"]) + assert "cnt" in [m["name"] for m in result.get("orders", [])] + + +def test_assign_metrics_no_entities(): + metrics = [{"name": "m", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "COUNT(*)"}]}}] + with warnings.catch_warnings(record=True): + result = _assign_metrics_to_entities(metrics, []) + assert result == {} + + +# ───────────────────────────────────────────────────────────────────────────── +# Path traversal guard +# ───────────────────────────────────────────────────────────────────────────── + +@pytest.mark.parametrize("rel_path,expected", [ + ("workspace.yml", True), + ("schema/orders/orders.yml", True), + ("schema/orders/datasets/orders.yml", True), + ("../evil.yml", False), + ("../../etc/passwd", False), + ("schema/../../../evil", False), +]) +def test_check_safe_path(rel_path, expected): + output_abs = os.path.abspath("/tmp/test_output") + assert _check_safe_path(output_abs, rel_path) is expected + + +# ───────────────────────────────────────────────────────────────────────────── +# OSI → Honeydew: file content +# ───────────────────────────────────────────────────────────────────────────── + +_REL_MODEL = { + "name": "m", + "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], + "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}], +} + +@pytest.mark.parametrize("model,path,expected", [ + # ── minimal model ────────────────────────────────────────────────────────── + pytest.param( + _minimal_model(), + "workspace.yml", + {"type": "workspace", "name": "test_model"}, + id="minimal-workspace", + ), + pytest.param( + _minimal_model(), + "schema/orders/orders.yml", + {"type": "entity", "name": "orders", "keys": ["order_id"], + "key_dataset": "orders", "relations": []}, + id="minimal-entity", + ), + pytest.param( + _minimal_model(), + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "db.schema.orders", "dataset_type": "table", + "attributes": [ + {"column": "order_id", "name": "order_id", "datatype": "string"}, + {"column": "order_date", "name": "order_date", "datatype": "timestamp"}, + {"column": "total_amount", "name": "total", "datatype": "number"}, + ], + }, + id="minimal-dataset", + ), + # ── complex expression → calculated attribute ────────────────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "disc_price", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "price * (1 - discount)"}]}, + "dimension": {"is_time": False}, + }]}]}, + "schema/orders/attributes/disc_price.yml", + {"type": "calculated_attribute", "entity": "orders", "name": "disc_price", + "datatype": "string", "sql": "price * (1 - discount)"}, + id="calc-attr-file", + ), + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "disc_price", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "price * (1 - discount)"}]}, + "dimension": {"is_time": False}, + }]}]}, + "schema/orders/datasets/orders.yml", + {"type": "dataset", "entity": "orders", "name": "orders", + "sql": "db.s.orders", "dataset_type": "table", "attributes": []}, + id="calc-attr-dataset-empty", + ), + # ── label → labels in attr ──────────────────────────────────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "status", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, + "dimension": {"is_time": False}, + "label": "sales", + }]}]}, + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "db.s.orders", "dataset_type": "table", + "attributes": [{"column": "status", "name": "status", + "datatype": "string", "labels": ["sales"], + "metadata": [{"name": "osi", "metadata": [ + {"name": "label", "value": "sales"} + ]}]}], + }, + id="label-in-attr", + ), + # ── ai_context string → description merged ──────────────────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "total", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, + "description": "Base desc", + "ai_context": "revenue, earnings", + }]}]}, + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "db.s.orders", "dataset_type": "table", + "attributes": [{"column": "total", "name": "total", "datatype": "number", + "description": "Base desc\nrevenue, earnings"}], + }, + id="ai-context-string", + ), + # ── ai_context dict → labels + description + metadata ──────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "total", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "total"}]}, + "ai_context": {"instructions": "Use for revenue", "synonyms": ["rev", "earnings"]}, + }]}]}, + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "db.s.orders", "dataset_type": "table", + "attributes": [{ + "column": "total", "name": "total", "datatype": "number", + "description": "Use for revenue", + "labels": ["rev", "earnings"], + "metadata": [{"name": "osi", "metadata": [ + {"name": "ai_context", + "value": '{"instructions": "Use for revenue", "synonyms": ["rev", "earnings"]}'}, + ]}], + }], + }, + id="ai-context-dict", + ), + # ── unique_keys → entity metadata ───────────────────────────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "items", "source": "db.s.items", + "primary_key": ["item_id"], + "unique_keys": [["sku"], ["item_id", "variant"]], + "fields": []}]}, + "schema/items/items.yml", + { + "type": "entity", "name": "items", "keys": ["item_id"], + "key_dataset": "items", "relations": [], + "metadata": [{"name": "osi", "metadata": [ + {"name": "unique_keys", "value": '[["sku"], ["item_id", "variant"]]'}, + ]}], + }, + id="unique-keys", + ), + # ── non-HONEYDEW custom_extensions → entity metadata ────────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "custom_extensions": [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}], + "fields": []}]}, + "schema/orders/orders.yml", + { + "type": "entity", "name": "orders", "key_dataset": "orders", "relations": [], + "metadata": [{"name": "osi", "metadata": [ + {"name": "custom_extensions", + "value": '[{"vendor_name": "SNOWFLAKE", "data": "{\\"warehouse\\": \\"WH\\"}"}]'}, + ]}], + }, + id="custom-ext", + ), + # ── relationship on from-entity; nothing on to-entity ──────────────────── + pytest.param( + _REL_MODEL, + "schema/orders/orders.yml", + { + "type": "entity", "name": "orders", "key_dataset": "orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "name": "orders_to_customers", + "connection": [{"src_field": "cid", "target_field": "id"}]}], + }, + id="relation-from-entity", + ), + pytest.param( + _REL_MODEL, + "schema/customers/customers.yml", + {"type": "entity", "name": "customers", "key_dataset": "customers", "relations": []}, + id="relation-to-entity-empty", + ), + # ── model-level ai_context → workspace metadata ─────────────────────────── + pytest.param( + {"name": "m", "datasets": [], + "ai_context": {"instructions": "Use for retail analytics", "synonyms": ["store"]}}, + "workspace.yml", + { + "type": "workspace", "name": "m", + "metadata": [{"name": "osi", "metadata": [ + {"name": "ai_context", + "value": '{"instructions": "Use for retail analytics", "synonyms": ["store"]}'}, + ]}], + }, + id="model-ai-context", + ), + # ── metric ──────────────────────────────────────────────────────────────── + pytest.param( + {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [{"name": "total_rev", "description": "Sum of sales", + "expression": {"dialects": [{"dialect": "ANSI_SQL", + "expression": "SUM(orders.total)"}]}}]}, + "schema/orders/metrics/total_rev.yml", + {"type": "metric", "entity": "orders", "name": "total_rev", + "datatype": "number", "sql": "SUM(orders.total)", "description": "Sum of sales"}, + id="metric", + ), + # ── subquery source → dataset_type sql ─────────────────────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "orders", + "source": "SELECT * FROM raw.orders WHERE active = true", "fields": []}]}, + "schema/orders/datasets/orders.yml", + {"type": "dataset", "entity": "orders", "name": "orders", + "sql": "SELECT * FROM raw.orders WHERE active = true", + "dataset_type": "sql", "attributes": []}, + id="subquery-source", + ), + # ── composite primary key ───────────────────────────────────────────────── + pytest.param( + {"name": "m", "datasets": [{"name": "li", "source": "db.s.li", + "primary_key": ["order_id", "line_number"], "fields": []}]}, + "schema/li/li.yml", + {"type": "entity", "name": "li", "keys": ["order_id", "line_number"], + "key_dataset": "li", "relations": []}, + id="composite-pk", + ), +]) +def test_osi_to_honeydew_file_content(model, path, expected): + files = convert_osi_to_honeydew(_osi(model)) + assert path in files + assert yaml.safe_load(files[path]) == expected + + +def test_osi_to_honeydew_metric_entity_hint_overrides_expression(): + model = {"name": "m", + "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], + "metrics": [{ + "name": "cnt", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.x)"}]}, + "custom_extensions": [{"vendor_name": "HONEYDEW", "data": '{"entity": "customers"}'}], + }]} + files = convert_osi_to_honeydew(_osi(model)) + assert "schema/customers/metrics/cnt.yml" in files + assert "schema/orders/metrics/cnt.yml" not in files + + +def test_osi_to_honeydew_invalid_version_raises(): + with pytest.raises(HoneydewConversionError, match="Unsupported"): + convert_osi_to_honeydew("version: '9.9.9'\nsemantic_model:\n - name: m\n") + + +def test_osi_to_honeydew_missing_semantic_model_raises(): + with pytest.raises(HoneydewConversionError): + convert_osi_to_honeydew(f"version: '{OSI_VERSION}'\n") + + +def test_osi_to_honeydew_multiple_models_warns(): + doc = yaml.dump({"version": OSI_VERSION, "semantic_model": [ + {"name": "m1", "datasets": []}, + {"name": "m2", "datasets": []}, + ]}) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + files = convert_osi_to_honeydew(doc) + assert any("only the first" in str(x.message) for x in w) + assert yaml.safe_load(files["workspace.yml"]) == {"type": "workspace", "name": "m1"} + + +# ───────────────────────────────────────────────────────────────────────────── +# Honeydew → OSI: full document +# ───────────────────────────────────────────────────────────────────────────── + +def _hd_root(sm): + return {"version": OSI_VERSION, "vendors": ["HONEYDEW"], "semantic_model": [sm]} + + +def _ansi(expr): + return {"dialects": [{"dialect": "ANSI_SQL", "expression": expr}]} + + +@pytest.mark.parametrize("ws_name,entities,expected_root", [ + # ── basic entity with two dataset attributes ────────────────────────────── + pytest.param( + "tpch", + [{"name": "orders", "keys": ["orderkey"], "key_dataset": "tpch_orders", + "sql": "SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS", + "dataset_attrs": [ + {"column": "o_orderkey", "name": "orderkey", "datatype": "number"}, + {"column": "o_orderdate", "name": "orderdate", "datatype": "date"}, + ]}], + _hd_root({ + "name": "tpch", + "datasets": [{ + "name": "orders", + "source": "SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS", + "primary_key": ["orderkey"], + "fields": [ + {"name": "orderkey", "expression": _ansi("o_orderkey")}, + {"name": "orderdate", "expression": _ansi("o_orderdate"), + "dimension": {"is_time": True}}, + ], + }], + }), + id="basic", + ), + # ── field types ─────────────────────────────────────────────────────────── + pytest.param( + "ws", + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [{"column": "id", "name": "id", "datatype": "number"}]}], + _hd_root({"name": "ws", "datasets": [{ + "name": "orders", "source": "db.s.orders", "primary_key": ["id"], + "fields": [{"name": "id", "expression": _ansi("id")}], + }]}), + id="field-number", + ), + pytest.param( + "ws", + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [{"column": "status", "name": "status", "datatype": "string"}]}], + _hd_root({"name": "ws", "datasets": [{ + "name": "orders", "source": "db.s.orders", "primary_key": ["id"], + "fields": [{"name": "status", "expression": _ansi("status"), + "dimension": {"is_time": False}}], + }]}), + id="field-string", + ), + pytest.param( + "ws", + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [{"column": "created_at", "name": "created_at", "datatype": "timestamp"}]}], + _hd_root({"name": "ws", "datasets": [{ + "name": "orders", "source": "db.s.orders", "primary_key": ["id"], + "fields": [{"name": "created_at", "expression": _ansi("created_at"), + "dimension": {"is_time": True}}], + }]}), + id="field-timestamp", + ), + # ── labels → label + ai_context + custom_extension ─────────────────────── + pytest.param( + "ws", + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [{"column": "status", "name": "status", "datatype": "string", + "labels": ["sales", "reporting"]}]}], + _hd_root({"name": "ws", "datasets": [{ + "name": "orders", "source": "db.s.orders", "primary_key": ["id"], + "fields": [{ + "name": "status", "expression": _ansi("status"), + "dimension": {"is_time": False}, + "ai_context": {"synonyms": ["sales", "reporting"]}, + "label": "sales", + "custom_extensions": [ + {"vendor_name": "HONEYDEW", "data": '{"labels": ["sales", "reporting"]}'}, + ], + }], + }]}), + id="labels", + ), + # ── many-to-one relationship ────────────────────────────────────────────── + pytest.param( + "ws", + [ + {"name": "orders", "keys": ["order_id"], "key_dataset": "orders", "sql": "db.s.orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "connection": [{"src_field": "customer_id", "target_field": "id"}]}], + "dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", + "sql": "db.s.customers", "dataset_attrs": []}, + ], + _hd_root({ + "name": "ws", + "datasets": [ + {"name": "customers", "source": "db.s.customers", "primary_key": ["id"]}, + {"name": "orders", "source": "db.s.orders", "primary_key": ["order_id"]}, + ], + "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", + "from_columns": ["customer_id"], "to_columns": ["id"]}], + }), + id="many-to-one", + ), + # ── one-to-many (direction flipped) ────────────────────────────────────── + pytest.param( + "ws", + [ + {"name": "customers", "keys": ["id"], "key_dataset": "customers", "sql": "db.s.customers", + "relations": [{"target_entity": "orders", "rel_type": "one-to-many", + "connection": [{"src_field": "id", "target_field": "customer_id"}]}], + "dataset_attrs": []}, + {"name": "orders", "keys": ["order_id"], "key_dataset": "orders", + "sql": "db.s.orders", "dataset_attrs": []}, + ], + _hd_root({ + "name": "ws", + "datasets": [ + {"name": "customers", "source": "db.s.customers", "primary_key": ["id"]}, + {"name": "orders", "source": "db.s.orders", "primary_key": ["order_id"]}, + ], + "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", + "from_columns": ["customer_id"], "to_columns": ["id"]}], + }), + id="one-to-many-flipped", + ), + # ── metric ──────────────────────────────────────────────────────────────── + pytest.param( + "ws", + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [], + "metrics": [{"type": "metric", "entity": "orders", "name": "count", + "datatype": "number", "sql": "COUNT(*)"}]}], + _hd_root({"name": "ws", "datasets": [ + {"name": "orders", "source": "db.s.orders", "primary_key": ["id"]}, + ], "metrics": [{ + "name": "count", + "expression": _ansi("COUNT(*)"), + "custom_extensions": [{"vendor_name": "HONEYDEW", "data": '{"entity": "orders"}'}], + }]}), + id="metric", + ), + # ── calculated attribute → OSI field with HONEYDEW extension ───────────── + pytest.param( + "ws", + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "discounted", "datatype": "number", + "sql": "orders.price * (1 - orders.discount)"}]}], + _hd_root({"name": "ws", "datasets": [{ + "name": "orders", "source": "db.s.orders", "primary_key": ["id"], + "fields": [{ + "name": "discounted", + "expression": _ansi("orders.price * (1 - orders.discount)"), + "custom_extensions": [ + {"vendor_name": "HONEYDEW", + "data": '{"type": "calculated_attribute", "entity": "orders"}'}, + ], + }], + }]}), + id="calc-attr", + ), +]) +def test_honeydew_to_osi_output(tmp_path, ws_name, entities, expected_root): + _write_workspace(str(tmp_path), ws_name, entities) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert result == expected_root + + +def test_honeydew_to_osi_missing_workspace_raises(tmp_path): + with pytest.raises(HoneydewConversionError, match="workspace.yml"): + convert_honeydew_to_osi(str(tmp_path)) + + +def test_honeydew_to_osi_missing_schema_dir_empty_model(tmp_path): + (tmp_path / "workspace.yml").write_text(yaml.dump({"type": "workspace", "name": "ws"})) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert result == {"version": OSI_VERSION, "vendors": ["HONEYDEW"], + "semantic_model": [{"name": "ws", "datasets": []}]} + + +def test_honeydew_to_osi_empty_metric_sql_skipped(tmp_path): + _write_workspace(str(tmp_path), "ws", [{"name": "orders", "keys": ["id"], + "key_dataset": "orders", "sql": "db.s.orders", "dataset_attrs": [], + "metrics": [{"type": "metric", "entity": "orders", "name": "bad", + "datatype": "number", "sql": ""}]}]) + with warnings.catch_warnings(record=True): + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert "metrics" not in result["semantic_model"][0] + + +def test_honeydew_to_osi_duplicate_relations_deduplicated(tmp_path): + _write_workspace(str(tmp_path), "ws", [ + {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "db.s.orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "connection": [{"src_field": "cid", "target_field": "id"}]}], + "dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", "sql": "db.s.customers", + "relations": [{"target_entity": "orders", "rel_type": "one-to-many", + "connection": [{"src_field": "id", "target_field": "cid"}]}], + "dataset_attrs": []}, + ]) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert len(result["semantic_model"][0].get("relationships", [])) == 1 + + +# ───────────────────────────────────────────────────────────────────────────── +# OSI → Honeydew → OSI round-trip: full semantic model +# ───────────────────────────────────────────────────────────────────────────── + +@pytest.mark.parametrize("model,expected_sm", [ + pytest.param( + {"name": "retail", "description": "Retail model", "datasets": []}, + {"name": "retail", "datasets": [], "description": "Retail model"}, + id="name-desc", + ), + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "primary_key": ["order_id"], "fields": []}]}, + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "primary_key": ["order_id"]}]}, + id="pk-single", + ), + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "primary_key": ["order_id", "line_no"], "fields": []}]}, + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "primary_key": ["order_id", "line_no"]}]}, + id="pk-composite", + ), + pytest.param( + {"name": "m", "datasets": [{"name": "items", "source": "db.s.items", + "primary_key": ["id"], + "unique_keys": [["sku"], ["id", "variant"]], + "fields": []}]}, + {"name": "m", "datasets": [{"name": "items", "source": "db.s.items", + "primary_key": ["id"], + "unique_keys": [["sku"], ["id", "variant"]]}]}, + id="unique-keys", + ), + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "status", "label": "sales", + "expression": _ansi("status"), + "dimension": {"is_time": False}}]}]}, + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "status", "expression": _ansi("status"), + "dimension": {"is_time": False}, + "label": "sales"}]}]}, + id="field-label", + ), + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "total", + "expression": _ansi("total"), + "ai_context": {"instructions": "Use for revenue analysis", + "synonyms": ["revenue", "sales"]}}]}]}, + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "total", "expression": _ansi("total"), + "description": "Use for revenue analysis", + "ai_context": {"instructions": "Use for revenue analysis", + "synonyms": ["revenue", "sales"]}, + "custom_extensions": [ + {"vendor_name": "HONEYDEW", + "data": '{"labels": ["revenue", "sales"]}'}, + ]}]}]}, + id="ai-context-dict", + ), + pytest.param( + {"name": "m", "ai_context": {"instructions": "Retail analytics", "synonyms": ["store"]}, + "datasets": []}, + {"name": "m", "datasets": [], + "ai_context": {"instructions": "Retail analytics", "synonyms": ["store"]}}, + id="model-ai-context", + ), + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "custom_extensions": [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}], + "fields": []}]}, + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "custom_extensions": [{"vendor_name": "SNOWFLAKE", "data": '{"warehouse": "WH"}'}]}]}, + id="custom-ext", + ), + pytest.param( + {"name": "m", "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}]}, + {"name": "m", "datasets": [ + {"name": "customers", "source": "db.s.customers"}, + {"name": "orders", "source": "db.s.orders"}, + ], "relationships": [{"name": "orders_to_customers", "from": "orders", "to": "customers", + "from_columns": ["cid"], "to_columns": ["id"]}]}, + id="relationship", + ), + pytest.param( + {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [{"name": "total_revenue", "description": "Sum of sales", + "expression": _ansi("SUM(orders.total)")}]}, + {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders"}], + "metrics": [{"name": "total_revenue", + "expression": _ansi("SUM(orders.total)"), + "custom_extensions": [ + {"vendor_name": "HONEYDEW", "data": '{"entity": "orders"}'}, + ], + "description": "Sum of sales"}]}, + id="metric", + ), + # ai_context string is merged into description and not stored in metadata for fields + pytest.param( + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "status", + "expression": _ansi("status"), + "ai_context": "order status, order state", + "dimension": {"is_time": False}}]}]}, + {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", + "fields": [{"name": "status", "expression": _ansi("status"), + "dimension": {"is_time": False}, + "description": "order status, order state"}]}]}, + id="ai-context-string-becomes-desc", + ), +]) +def test_osi_roundtrip_sm(tmp_path, model, expected_sm): + assert _osi_roundtrip(model, tmp_path) == expected_sm + + +def test_osi_roundtrip_tpcds_example(tmp_path): + tpcds_path = ( + Path(__file__).resolve().parent.parent.parent.parent + / "examples" / "tpcds_semantic_model.yaml" + ) + if not tpcds_path.exists(): + pytest.skip("TPC-DS example not found") + osi_yaml = tpcds_path.read_text() + files = convert_osi_to_honeydew(osi_yaml) + for rel_path, content in files.items(): + p = tmp_path / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + sm = result["semantic_model"][0] + assert sm["name"] == "tpcds_retail_model" + ds_names = {ds["name"] for ds in sm["datasets"]} + assert "store_sales" in ds_names and "customer" in ds_names + + +# ───────────────────────────────────────────────────────────────────────────── +# Honeydew → OSI → Honeydew round-trip: full file content +# ───────────────────────────────────────────────────────────────────────────── + +@pytest.mark.parametrize("entities,path,expected", [ + # ── entity name + keys ──────────────────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["order_id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": []}], + "schema/orders/orders.yml", + {"type": "entity", "name": "orders", "keys": ["order_id"], + "key_dataset": "orders", "relations": []}, + id="entity-keys", + ), + # ── dataset source ──────────────────────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.SCHEMA.ORDERS", "dataset_attrs": []}], + "schema/orders/datasets/orders.yml", + {"type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.SCHEMA.ORDERS", "dataset_type": "table", "attributes": []}, + id="dataset-source", + ), + # ── column attributes ───────────────────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [ + {"column": "o_id", "name": "id", "datatype": "number"}, + {"column": "o_status", "name": "status", "datatype": "string"}, + ]}], + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", + "attributes": [ + {"column": "o_id", "name": "id", "datatype": "number"}, + {"column": "o_status", "name": "status", "datatype": "string"}, + ], + }, + id="column-attrs", + ), + # ── labels on column ────────────────────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [{"column": "status", "name": "status", "datatype": "string", + "labels": ["sales"]}]}], + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", + "attributes": [{ + "column": "status", "name": "status", "datatype": "string", + "labels": ["sales"], + "metadata": [{"name": "osi", "metadata": [ + {"name": "ai_context", "value": '{"synonyms": ["sales"]}'}, + ]}], + }], + }, + id="labels-on-column", + ), + # ── calculated attribute sql ─────────────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "disc", "datatype": "number", + "sql": "orders.price * (1 - orders.discount)"}]}], + "schema/orders/attributes/disc.yml", + {"type": "calculated_attribute", "entity": "orders", "name": "disc", + "datatype": "number", "sql": "orders.price * (1 - orders.discount)"}, + id="calc-attr-sql", + ), + # ── calc attr with simple identifier stays as calc_attr ────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "revenue", "datatype": "number", "sql": "revenue"}]}], + "schema/orders/attributes/revenue.yml", + {"type": "calculated_attribute", "entity": "orders", "name": "revenue", + "datatype": "number", "sql": "revenue"}, + id="calc-simple-stays-calc", + ), + # ── metric entity assignment ────────────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], + "metrics": [{"type": "metric", "entity": "orders", "name": "cnt", + "datatype": "number", "sql": "COUNT(*)"}]}], + "schema/orders/metrics/cnt.yml", + {"type": "metric", "entity": "orders", "name": "cnt", + "datatype": "number", "sql": "COUNT(*)"}, + id="metric", + ), + # ── many-to-one relation ────────────────────────────────────────────────── + pytest.param( + [ + {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "connection": [{"src_field": "cid", "target_field": "id"}]}], + "dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", + "sql": "DB.S.CUSTOMERS", "dataset_attrs": []}, + ], + "schema/orders/orders.yml", + { + "type": "entity", "name": "orders", "keys": ["id"], + "key_dataset": "orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "name": "orders_to_customers", + "connection": [{"src_field": "cid", "target_field": "id"}]}], + }, + id="relation", + ), + # ── connection_expr round-trip ──────────────────────────────────────────── + pytest.param( + [ + {"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "connection_expr": {"sql": "orders.cid = customers.id AND orders.region = customers.region"}}], + "dataset_attrs": []}, + {"name": "customers", "keys": ["id"], "key_dataset": "customers", + "sql": "DB.S.CUSTOMERS", "dataset_attrs": []}, + ], + "schema/orders/orders.yml", + { + "type": "entity", "name": "orders", "keys": ["id"], + "key_dataset": "orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", + "name": "orders_to_customers", + "connection_expr": {"sql": "orders.cid = customers.id AND orders.region = customers.region"}}], + }, + id="connection-expr", + ), + # ── bool datatype ───────────────────────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [{"column": "is_active", "name": "is_active", "datatype": "bool"}]}], + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", + "attributes": [{"column": "is_active", "name": "is_active", "datatype": "bool"}], + }, + id="bool-datatype", + ), + # ── Honeydew-specific attribute fields ─────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [{"column": "status", "name": "status", "datatype": "string", + "display_name": "Order Status"}]}], + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", + "attributes": [{"column": "status", "name": "status", "datatype": "string", + "display_name": "Order Status"}], + }, + id="attr-display-name", + ), + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [{"column": "status", "name": "status", "datatype": "string", + "hidden": True}]}], + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", + "attributes": [{"column": "status", "name": "status", "datatype": "string", + "hidden": True}], + }, + id="attr-hidden", + ), + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [{"column": "status", "name": "status", "datatype": "string", + "format_string": "##,###"}]}], + "schema/orders/datasets/orders.yml", + { + "type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", + "attributes": [{"column": "status", "name": "status", "datatype": "string", + "format_string": "##,###"}], + }, + id="attr-format-string", + ), + # ── Honeydew-specific calc attr fields ──────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "disc", "datatype": "number", + "sql": "orders.price * 0.9", "display_name": "Discounted Price"}]}], + "schema/orders/attributes/disc.yml", + {"type": "calculated_attribute", "entity": "orders", "name": "disc", + "datatype": "number", "sql": "orders.price * 0.9", "display_name": "Discounted Price"}, + id="calc-display-name", + ), + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], + "calc_attrs": [{"type": "calculated_attribute", "entity": "orders", + "name": "disc", "datatype": "number", + "sql": "orders.price * 0.9", "timegrain": "day"}]}], + "schema/orders/attributes/disc.yml", + {"type": "calculated_attribute", "entity": "orders", "name": "disc", + "datatype": "number", "sql": "orders.price * 0.9", "timegrain": "day"}, + id="calc-timegrain", + ), + # ── Honeydew-specific entity fields ────────────────────────────────────── + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], "owner": "analytics_team"}], + "schema/orders/orders.yml", + {"type": "entity", "name": "orders", "keys": ["id"], "key_dataset": "orders", + "owner": "analytics_team", "relations": []}, + id="entity-owner", + ), + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], "display_name": "Orders Table"}], + "schema/orders/orders.yml", + {"type": "entity", "name": "orders", "keys": ["id"], "key_dataset": "orders", + "display_name": "Orders Table", "relations": []}, + id="entity-display-name", + ), + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], "hidden": True}], + "schema/orders/orders.yml", + {"type": "entity", "name": "orders", "keys": ["id"], "key_dataset": "orders", + "hidden": True, "relations": []}, + id="entity-hidden", + ), + pytest.param( + [{"name": "orders", "keys": ["id"], "key_dataset": "orders", "sql": "DB.S.ORDERS", + "dataset_attrs": [], "folder": "finance"}], + "schema/orders/orders.yml", + {"type": "entity", "name": "orders", "keys": ["id"], "key_dataset": "orders", + "folder": "finance", "relations": []}, + id="entity-folder", + ), +]) +def test_honeydew_roundtrip_file(tmp_path, entities, path, expected): + out_dir = _honeydew_roundtrip(entities, tmp_path) + p = out_dir / path + assert p.exists(), f"Expected file {path!r} was not generated" + assert yaml.safe_load(p.read_text()) == expected + + +# ───────────────────────────────────────────────────────────────────────────── +# Bug-fix regression tests +# ───────────────────────────────────────────────────────────────────────────── + +@pytest.mark.parametrize("expression", [ + {"dialects": [{"dialect": "ANSI_SQL", "expression": ""}]}, + {"dialects": [{"dialect": "ANSI_SQL", "expression": " "}]}, +]) +def test_empty_or_whitespace_field_expression_skipped(expression): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "bad", + "expression": expression, + "dimension": {"is_time": False}, + }]}]} + files = convert_osi_to_honeydew(_osi(model)) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + assert ds == {"type": "dataset", "entity": "orders", "name": "orders", + "sql": "db.s.orders", "dataset_type": "table", "attributes": []} + assert "schema/orders/attributes/bad.yml" not in files + + +@pytest.mark.parametrize("expression", [ + {"dialects": [{"dialect": "ANSI_SQL", "expression": ""}]}, + {"dialects": [{"dialect": "ANSI_SQL", "expression": " "}]}, +]) +def test_empty_or_whitespace_metric_expression_skipped(expression): + model = {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [{"name": "bad_m", "expression": expression}]} + files = convert_osi_to_honeydew(_osi(model)) + assert "schema/orders/metrics/bad_m.yml" not in files + + +def test_non_dict_expression_warns(): + model = {"name": "m", "datasets": [{"name": "orders", "source": "db.s.orders", "fields": [{ + "name": "bad", + "expression": "just_a_string", + "dimension": {"is_time": False}, + }]}]} + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + files = convert_osi_to_honeydew(_osi(model)) + assert any("must be a mapping" in str(x.message) for x in w) + ds = yaml.safe_load(files["schema/orders/datasets/orders.yml"]) + assert ds == {"type": "dataset", "entity": "orders", "name": "orders", + "sql": "db.s.orders", "dataset_type": "table", "attributes": []} + + +def test_duplicate_metric_name_warns(): + model = {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [ + {"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.a)"}]}}, + {"name": "total", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.b)"}]}}, + ]} + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + files = convert_osi_to_honeydew(_osi(model)) + assert any("total" in str(x.message) for x in w) + assert yaml.safe_load(files["schema/orders/metrics/total.yml"]) == { + "type": "metric", "entity": "orders", "name": "total", + "datatype": "number", "sql": "SUM(orders.b)", + } + + +def test_metric_string_ai_context_preserved_in_roundtrip(tmp_path): + model = {"name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders", "fields": []}], + "metrics": [{"name": "rev", "ai_context": "Use for revenue analysis", + "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "SUM(orders.total)"}]}}]} + sm = _osi_roundtrip(model, tmp_path) + assert sm == { + "name": "m", + "datasets": [{"name": "orders", "source": "db.s.orders"}], + "metrics": [{ + "name": "rev", + "expression": _ansi("SUM(orders.total)"), + "custom_extensions": [{"vendor_name": "HONEYDEW", "data": '{"entity": "orders"}'}], + "description": "Use for revenue analysis", + "ai_context": "Use for revenue analysis", + }], + } + + +def test_malformed_osi_metadata_json_warns(tmp_path): + ws_path = tmp_path / "workspace.yml" + ws_path.write_text(yaml.dump({"type": "workspace", "name": "ws"})) + base = tmp_path / "schema" / "orders" + (base / "datasets").mkdir(parents=True) + entity = { + "type": "entity", "name": "orders", "keys": ["id"], "key_dataset": "orders", + "relations": [], + "metadata": [{"name": "osi", "metadata": [ + {"name": "unique_keys", "value": "[broken json"}, + ]}], + } + (base / "orders.yml").write_text(yaml.dump(entity)) + (base / "datasets" / "orders.yml").write_text(yaml.dump( + {"type": "dataset", "entity": "orders", "name": "orders", + "sql": "DB.S.ORDERS", "dataset_type": "table", "attributes": []} + )) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + convert_honeydew_to_osi(str(tmp_path)) + assert any("unique_keys" in str(x.message) for x in w) + + +# ───────────────────────────────────────────────────────────────────────────── +# _fields_to_honeydew unit tests +# ───────────────────────────────────────────────────────────────────────────── + +def test_fields_to_honeydew_simple_identifier_goes_to_dataset(): + fields = [{"name": "status", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "status"}]}, + "dimension": {"is_time": False}}] + dataset_attrs, calc_attrs = _fields_to_honeydew(fields, "orders") + assert dataset_attrs == [{"column": "status", "name": "status", "datatype": "string"}] + assert calc_attrs == [] + + +def test_fields_to_honeydew_complex_sql_goes_to_calc(): + fields = [{"name": "disc", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "price * 0.9"}]}}] + dataset_attrs, calc_attrs = _fields_to_honeydew(fields, "orders") + assert dataset_attrs == [] + assert calc_attrs == [{"type": "calculated_attribute", "entity": "orders", "name": "disc", + "datatype": "number", "sql": "price * 0.9"}] + + +def test_fields_to_honeydew_missing_name_raises(): + fields = [{"expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": "col"}]}}] + with pytest.raises(HoneydewConversionError, match="missing 'name'"): + _fields_to_honeydew(fields, "orders") + + +def test_fields_to_honeydew_empty_expression_skipped(): + fields = [{"name": "bad", "expression": {"dialects": [{"dialect": "ANSI_SQL", "expression": ""}]}}] + dataset_attrs, calc_attrs = _fields_to_honeydew(fields, "orders") + assert dataset_attrs == [] and calc_attrs == [] + + +# ───────────────────────────────────────────────────────────────────────────── +# Connectionless relation warning +# ───────────────────────────────────────────────────────────────────────────── + +def test_connectionless_relation_warns(): + model = {"name": "m", "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []}, + {"name": "customers", "source": "db.s.customers", "fields": []}, + ], "relationships": [{"name": "r", "from": "orders", "to": "customers"}]} + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + files = convert_osi_to_honeydew(_osi(model)) + assert any("resolve the join" in str(x.message) for x in w) + assert yaml.safe_load(files["schema/orders/orders.yml"]) == { + "type": "entity", "name": "orders", "key_dataset": "orders", + "relations": [{"target_entity": "customers", "rel_type": "many-to-one", "name": "r"}], + } + + +# ───────────────────────────────────────────────────────────────────────────── +# Vendors round-trip +# ───────────────────────────────────────────────────────────────────────────── + +@pytest.mark.parametrize("input_vendors,expected_vendors", [ + (["SNOWFLAKE", "HONEYDEW"], ["HONEYDEW", "SNOWFLAKE"]), + (["SNOWFLAKE"], ["HONEYDEW", "SNOWFLAKE"]), + (["HONEYDEW"], ["HONEYDEW"]), +]) +def test_vendors_roundtrip(tmp_path, input_vendors, expected_vendors): + doc = yaml.dump({ + "version": OSI_VERSION, + "vendors": input_vendors, + "semantic_model": [{"name": "m", "datasets": []}], + }) + files = convert_osi_to_honeydew(doc) + for rel_path, content in files.items(): + p = tmp_path / rel_path + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + result = yaml.safe_load(convert_honeydew_to_osi(str(tmp_path))) + assert result == {"version": OSI_VERSION, "vendors": expected_vendors, + "semantic_model": [{"name": "m", "datasets": []}]} + + +# ───────────────────────────────────────────────────────────────────────────── +# main() CLI smoke tests +# ───────────────────────────────────────────────────────────────────────────── + +def test_main_osi_to_honeydew(tmp_path): + import subprocess + input_file = tmp_path / "model.yaml" + input_file.write_text(yaml.dump({ + "version": OSI_VERSION, + "semantic_model": [{"name": "m", "datasets": [ + {"name": "orders", "source": "db.s.orders", "fields": []} + ]}], + })) + output_dir = tmp_path / "out" + result = subprocess.run( + [sys.executable, str(Path(__file__).resolve().parent.parent / "src" / "honeydew_osi_converter.py"), + "osi-to-honeydew", "-i", str(input_file), "-o", str(output_dir)], + capture_output=True, text=True, + ) + assert result.returncode == 0 + assert yaml.safe_load((output_dir / "workspace.yml").read_text()) == { + "type": "workspace", "name": "m", + } + + +def test_main_honeydew_to_osi(tmp_path): + import subprocess + _write_workspace(str(tmp_path), "ws", [{ + "name": "orders", "keys": ["id"], "key_dataset": "orders", + "sql": "DB.S.ORDERS", "dataset_attrs": [], + }]) + output_file = tmp_path / "output.yaml" + result = subprocess.run( + [sys.executable, str(Path(__file__).resolve().parent.parent / "src" / "honeydew_osi_converter.py"), + "honeydew-to-osi", "-i", str(tmp_path), "-o", str(output_file)], + capture_output=True, text=True, + ) + assert result.returncode == 0 + assert yaml.safe_load(output_file.read_text()) == { + "version": OSI_VERSION, + "vendors": ["HONEYDEW"], + "semantic_model": [{"name": "ws", "datasets": [ + {"name": "orders", "source": "DB.S.ORDERS", "primary_key": ["id"]}, + ]}], + } + + +def test_main_path_traversal_rejected(tmp_path): + import subprocess + input_file = tmp_path / "model.yaml" + input_file.write_text( + f"version: '{OSI_VERSION}'\nsemantic_model:\n" + " - name: m\n datasets:\n" + " - name: '../../evil'\n source: db.s.evil\n fields: []\n" + ) + output_dir = tmp_path / "out" + result = subprocess.run( + [sys.executable, str(Path(__file__).resolve().parent.parent / "src" / "honeydew_osi_converter.py"), + "osi-to-honeydew", "-i", str(input_file), "-o", str(output_dir)], + capture_output=True, text=True, + ) + assert result.returncode == 1 + assert "refusing to write" in result.stderr + assert not (tmp_path / "evil.yml").exists() diff --git a/core-spec/spec.md b/core-spec/spec.md index 089f56d..ac410e3 100644 --- a/core-spec/spec.md +++ b/core-spec/spec.md @@ -54,6 +54,7 @@ The following are well-known examples: | `DBT` | dbt-specific attributes | | `DATABRICKS` | Databricks-specific attributes | | `GOODDATA` | GoodData-specific attributes | +| `HONEYDEW` | Honeydew-specific attributes | ## Semantic Model