Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 95 additions & 3 deletions validation/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

Validates OSI YAML files against:
1. JSON Schema (structure, types, enums)
2. Unique names (datasets, fields, metrics, relationships)
3. Valid relationship references
4. SQL syntax (using sqlglot)
2. Unique names (semantic models, datasets, fields, metrics, relationships, dialects)
3. Valid relationship references and column count consistency
4. Custom extension JSON validity
5. SQL syntax (using sqlglot)

Usage:
python validation/validate.py <yaml_file>
Expand Down Expand Up @@ -70,6 +71,11 @@ def validate_unique_names(data: dict) -> list[str]:
"""Validate unique names for datasets, fields, metrics, relationships."""
errors = []

# Check unique semantic model names
model_names = [m.get("name") for m in data.get("semantic_model", []) if m.get("name")]
for dup in find_duplicates(model_names):
errors.append(f"[Unique] Duplicate semantic model name '{dup}'")

for model in data.get("semantic_model", []):
model_name = model.get("name", "<unnamed>")

Expand Down Expand Up @@ -116,6 +122,90 @@ def validate_references(data: dict) -> list[str]:
if to_ds and to_ds not in dataset_names:
errors.append(f"[Reference] Relationship '{rel_name}' references unknown dataset '{to_ds}'")

# Check from_columns and to_columns have the same length
from_cols = rel.get("from_columns", [])
to_cols = rel.get("to_columns", [])
if from_cols and to_cols and len(from_cols) != len(to_cols):
errors.append(
f"[Reference] Relationship '{rel_name}': from_columns has {len(from_cols)} column(s) "
f"but to_columns has {len(to_cols)} column(s)"
)

return errors


def validate_custom_extensions(data: dict) -> list[str]:
"""Validate that custom extension 'data' fields contain valid JSON."""
errors = []

for model in data.get("semantic_model", []):
model_name = model.get("name", "<unnamed>")

# Collect all (context, extensions) pairs to check
extension_sources = [(f"model '{model_name}'", model.get("custom_extensions", []))]

for dataset in model.get("datasets", []):
ds_name = dataset.get("name", "<unnamed>")
extension_sources.append((f"dataset '{ds_name}'", dataset.get("custom_extensions", [])))
for field in dataset.get("fields", []):
field_name = field.get("name", "<unnamed>")
extension_sources.append(
(f"field '{ds_name}.{field_name}'", field.get("custom_extensions", []))
)

for metric in model.get("metrics", []):
metric_name = metric.get("name", "<unnamed>")
extension_sources.append((f"metric '{metric_name}'", metric.get("custom_extensions", [])))

for rel in model.get("relationships", []):
rel_name = rel.get("name", "<unnamed>")
extension_sources.append((f"relationship '{rel_name}'", rel.get("custom_extensions", [])))

for context, extensions in extension_sources:
for ext in extensions or []:
vendor = ext.get("vendor_name", "<unknown>")
ext_data = ext.get("data")
if ext_data is not None:
try:
json.loads(ext_data)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The data field from YAML will already be parsed as a Python object (dict/list/int), not a JSON string.

json.loads() expects a string. If someone writes:

custom_extensions:
  - vendor_name: acme
    data: '{"key": "value"}'    # string - works
  - vendor_name: acme
    data:                       # YAML dict - TypeError from json.loads
      key: value

The TypeError is caught, but the error message would be misleading. It would say "invalid JSON" when really the data is valid structured data, just not a JSON string.

The validation should clarify what it actually expects: is data supposed to be a JSON-encoded string or a structured object?

If either is acceptable, the check should only run json.loads when ext_data is a str.

except (json.JSONDecodeError, TypeError) as e:
errors.append(
f"[Extension] Custom extension for vendor '{vendor}' in {context} "
f"has invalid JSON in 'data': {e}"
)

return errors


def _expression_dialect_names(item: dict) -> list:
    """Return the non-empty 'dialect' values under item['expression']['dialects'].

    A missing, null, or non-mapping 'expression', or a 'dialects' value that
    is not a list, yields an empty list instead of raising. Entries that are
    not mappings, or whose 'dialect' is missing/empty, are ignored.
    """
    expression = item.get("expression")
    if not isinstance(expression, dict):
        return []
    dialect_entries = expression.get("dialects")
    if not isinstance(dialect_entries, list):
        return []
    return [
        entry["dialect"]
        for entry in dialect_entries
        if isinstance(entry, dict) and entry.get("dialect")
    ]


def validate_duplicate_dialects(data: dict) -> list[str]:
    """Validate that no expression has duplicate dialect entries.

    Walks every field of every dataset and every metric of each entry in
    data["semantic_model"], collects the 'dialect' values declared under
    expression.dialects, and reports each value returned by
    find_duplicates() (presumably the values that occur more than once --
    confirm against that helper's definition).

    Args:
        data: Parsed YAML document.

    Returns:
        A list of "[Unique] ..." error strings; empty when nothing is
        reported.
    """
    errors = []

    # `x.get(key) or []` rather than `x.get(key, [])`: a key that is present
    # but null (e.g. an empty `datasets:` in YAML) comes back as None, and
    # iterating None would crash this validator instead of returning errors.
    for model in data.get("semantic_model") or []:
        # Check field expressions
        for dataset in model.get("datasets") or []:
            ds_name = dataset.get("name", "<unnamed>")
            for field in dataset.get("fields") or []:
                field_name = field.get("name", "<unnamed>")
                for dup in find_duplicates(_expression_dialect_names(field)):
                    errors.append(f"[Unique] Duplicate dialect '{dup}' in field '{ds_name}.{field_name}'")

        # Check metric expressions
        for metric in model.get("metrics") or []:
            metric_name = metric.get("name", "<unnamed>")
            for dup in find_duplicates(_expression_dialect_names(metric)):
                errors.append(f"[Unique] Duplicate dialect '{dup}' in metric '{metric_name}'")

    return errors


Expand Down Expand Up @@ -217,6 +307,8 @@ def main():
errors.extend(validate_schema(data, schema))
errors.extend(validate_unique_names(data))
errors.extend(validate_references(data))
errors.extend(validate_custom_extensions(data))
errors.extend(validate_duplicate_dialects(data))
errors.extend(validate_sql(data))

# Report results
Expand Down