open-semantic-interchange · jonmmease · Apr 24, 2026 · KSDaemon · Apr 27, 2026 · jonmmease
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+*.py[cod]
diff --git a/converters/index.md b/converters/index.md
@@ -93,11 +93,17 @@ Datasets represent logical tables (fact or dimension tables). They contain field
 
 Fields represent row-level attributes. They can be simple column references or computed expressions.
 
+> **Note:** `datatype` (on `Field` and `Metric`) declares the logical data type of a field or metric; `dimension.is_time` (fields only) is a separate temporal-role marker.
+> A field may carry both, either one, or neither. Use `datatype` for data-type questions (casting, serialization); use `is_time` for role questions
+> (classifying time dimensions). When `is_time` is unset, it defaults to `true` if `datatype` is one of `date`, `time`, `timestamp`, or `timestamp_tz`,
+> and to `false` otherwise. An explicit `is_time` always wins over this default.
+
 | OSI Field | Description | Converter Consideration |
 |-----------|-------------|------------------------|
 | `name` | Field identifier | Map to column/attribute name |
 | `expression.dialects` | Multi-dialect SQL expressions | Select the dialect matching the target vendor; fall back to `ANSI_SQL` |
-| `dimension.is_time` | Whether the field is a time dimension | Map to vendor-specific time dimension markers |
+| `datatype` | Logical data type of the field (one of `string`, `integer`, `number`, `boolean`, `date`, `time`, `timestamp`, `timestamp_tz`, `other`). | Converters SHOULD consult `datatype` for the field's data type. The temporal members (`date`, `time`, `timestamp`, `timestamp_tz`) only supply the default for `dimension.is_time` when it is unset; classify time dimensions from the resolved `is_time` value, not from `datatype` alone. Use `other` + `custom_extensions` for types not covered by the enum. |
+| `dimension.is_time` | Temporal-role marker. When `true`, the field should be treated as a time dimension regardless of its `datatype` (e.g. an integer year grain, a string month name, or a date column). When unset, defaults to `true` for temporal `datatype`s (`date`, `time`, `timestamp`, `timestamp_tz`) and `false` otherwise. | Map to vendor-specific time dimension markers. Converters SHOULD classify as a time dimension when `is_time` resolves to `true` (either explicit or defaulted from a temporal `datatype`). An explicit `is_time: false` suppresses the time-dimension classification even on temporal-typed columns. |
 | `label` | Categorization label | Map if vendor supports field labels/tags |
 | `description` | Human-readable description | Most vendors support field descriptions |
 | `ai_context` | Synonyms and business context | Map if vendor supports semantic annotations |
@@ -144,6 +150,7 @@ Metrics are aggregate measures defined at the semantic model level. They can spa
 |-----------|-------------|------------------------|
 | `name` | Metric identifier | Map to vendor's measure/KPI name |
 | `expression.dialects` | Multi-dialect aggregate expressions | Select the appropriate dialect; fall back to `ANSI_SQL` |
+| `datatype` | Logical data type of the metric result (one of `string`, `integer`, `number`, `boolean`, `date`, `time`, `timestamp`, `timestamp_tz`, `other`). | Converters SHOULD consult `datatype` to declare the result type of the aggregation. Most numeric measures will be `number` or `integer`; use `other` + `custom_extensions` for types not covered by the enum. |
 | `description` | What the metric measures | Most vendors support descriptions |
 | `ai_context` | Synonyms and business context | Map if vendor supports semantic annotations |
 

diff --git a/converters/snowflake/src/osi_to_snowflake_yaml_converter.py b/converters/snowflake/src/osi_to_snowflake_yaml_converter.py
@@ -16,6 +16,8 @@
 
 SUPPORTED_VERSION = "0.1.1"
 
+_TIME_DATATYPES = frozenset({"date", "time", "timestamp", "timestamp_tz"})
+
 
 class OsiConversionError(Exception):
     """Raised when an OSI YAML cannot be converted to Snowflake format."""
@@ -199,11 +201,31 @@ def _convert_dataset(dataset):
 
 
 def _classify_field(field):
-    """Returns 'dimension', 'time_dimension', or 'fact' based on field structure."""
+    """Classify a field as 'fact', 'dimension', or 'time_dimension'.
+
+    ``datatype`` declares the field's data type; ``dimension.is_time`` is
+    an independent temporal-role marker. Classification rules:
+
+    - A field with no ``dimension`` block is a ``fact`` regardless of
+      ``datatype`` (data type does not imply role).
+    - Explicit ``dimension.is_time`` always wins: ``True`` classifies as
+      ``time_dimension``; ``False`` classifies as ``dimension`` even when
+      ``datatype`` is temporal (author opt-out for e.g. audit timestamps).
+    - When ``dimension.is_time`` is unset, it defaults to ``True`` for
+      temporal ``datatype`` values (``date``, ``time``, ``timestamp``,
+      ``timestamp_tz``) and ``False`` otherwise.
+    """
     dimension = field.get("dimension")
     if dimension is None:
         return "fact"
-    if isinstance(dimension, dict) and dimension.get("is_time") is True:
+    is_time = dimension.get("is_time") if isinstance(dimension, dict) else None
+    if is_time is True:
+        return "time_dimension"
+    if is_time is False:
+        return "dimension"
+    # is_time is unset; default from datatype
+    datatype = field.get("datatype")
+    if datatype in _TIME_DATATYPES:
         return "time_dimension"
     return "dimension"
 

diff --git a/converters/snowflake/tests/test_osi_to_snowflake_yaml_converter.py b/converters/snowflake/tests/test_osi_to_snowflake_yaml_converter.py
@@ -178,6 +178,58 @@ def test_dimension_bare_true(self):
     def test_dimension_none_is_fact(self):
         assert _classify_field({"dimension": None}) == "fact"
 
+    def test_datatype_timestamp_is_time_dimension(self):
+        assert _classify_field(
+            {"dimension": {}, "datatype": "timestamp"}
+        ) == "time_dimension"
+
+    def test_datatype_date_is_time_dimension(self):
+        assert _classify_field(
+            {"dimension": {}, "datatype": "date"}
+        ) == "time_dimension"
+
+    def test_datatype_time_is_time_dimension(self):
+        assert _classify_field(
+            {"dimension": {}, "datatype": "time"}
+        ) == "time_dimension"
+
+    def test_datatype_timestamp_tz_is_time_dimension(self):
+        assert _classify_field(
+            {"dimension": {}, "datatype": "timestamp_tz"}
+        ) == "time_dimension"
+
+    def test_datatype_string_is_dimension(self):
+        assert _classify_field(
+            {"dimension": {}, "datatype": "string"}
+        ) == "dimension"
+
+    def test_datatype_other_is_dimension(self):
+        assert _classify_field(
+            {"dimension": {}, "datatype": "other"}
+        ) == "dimension"
+
+    def test_is_time_preserved_when_datatype_non_temporal(self):
+        """A dimension with is_time=True is classified as a time_dimension
+        even when datatype is non-temporal, because is_time is an
+        independent role marker (e.g., d_year with datatype: integer and
+        is_time: true is a time-role integer grain)."""
+        assert _classify_field(
+            {"dimension": {"is_time": True}, "datatype": "integer"}
+        ) == "time_dimension"
+
+    def test_is_time_false_suppresses_time_dimension_on_temporal_datatype(self):
+        """Explicit is_time: false is an author opt-out for temporal
+        columns (e.g., an audit created_at that should not appear on
+        the time axis). Explicit is_time always wins over the default."""
+        assert _classify_field(
+            {"dimension": {"is_time": False}, "datatype": "timestamp"}
+        ) == "dimension"
+
+    def test_no_dimension_with_temporal_datatype_is_still_fact(self):
+        """A temporal datatype on a field with no dimension block is still
+        a fact; type does not imply role."""
+        assert _classify_field({"datatype": "timestamp"}) == "fact"
+
 
 # ---------------------------------------------------------------------------
 # _extract_expression

diff --git a/core-spec/osi-schema.json b/core-spec/osi-schema.json
@@ -122,13 +122,28 @@
       "required": ["dialects"],
       "additionalProperties": false
     },
+    "Datatype": {
+      "type": "string",
+      "enum": [
+        "string",
+        "integer",
+        "number",
+        "boolean",
+        "date",
+        "time",
+        "timestamp",
+        "timestamp_tz",
+        "other"
+      ],
+      "description": "Logical data type for fields and metrics. Describes what kind of value is stored, independent of role (e.g. dimension vs fact). Use `other` plus `custom_extensions` for vendor-specific types not covered by the enum."
+    },
     "Dimension": {
       "type": "object",
       "description": "Dimension metadata",
       "properties": {
         "is_time": {
           "type": "boolean",
-          "description": "Indicates if this is a time-based dimension for temporal filtering"
+          "description": "Temporal-role marker. When true, consumers that distinguish time dimensions (e.g. for time-series analysis or temporal filtering) should treat this field as a time dimension. This is a *role* flag, independent of the field's data type: a field with `is_time: true` may carry any `datatype` (e.g. `integer` for a year grain, `string` for a month name, as well as temporal datatypes). When `is_time` is unset, it defaults to `true` if `datatype` is one of `date`, `time`, `timestamp`, or `timestamp_tz`, and `false` otherwise. Set `is_time: false` explicitly to opt a temporal-typed column (such as an audit timestamp) out of time-dimension treatment."
         }
       },
       "additionalProperties": false
@@ -155,6 +170,9 @@
           "type": "string",
           "description": "Human-readable description"
         },
+        "datatype": {
+          "$ref": "#/$defs/Datatype"
+        },
         "ai_context": {
           "$ref": "#/$defs/AIContext"
         },
@@ -280,6 +298,9 @@
           "type": "string",
           "description": "Human-readable description of what the metric measures"
         },
+        "datatype": {
+          "$ref": "#/$defs/Datatype"
+        },
         "ai_context": {
           "$ref": "#/$defs/AIContext"
         },

diff --git a/core-spec/spec.md b/core-spec/spec.md
@@ -36,6 +36,22 @@ Supported SQL and expression language dialects for metrics and field definitions
 | `TABLEAU` | Tableau calculations |
 | `DATABRICKS` | Databricks SQL |
 
+### Datatypes
+
+Logical data types for fields and metrics.
+
+| Datatype | Description |
+|----------|-------------|
+| `string` | Variable-length Unicode character data. |
+| `integer` | Signed integer with no scale. |
+| `number` | Real number (floating-point or decimal) with unspecified precision. |
+| `boolean` | Logical two-valued truth type. |
+| `date` | Calendar date with no time-of-day component. |
+| `time` | Time-of-day with no date component. |
+| `timestamp` | Date and time-of-day without timezone offset (naive / local wall-clock value). |
+| `timestamp_tz` | Date and time-of-day with timezone offset (zoned); identifies an absolute instant in time. |
+| `other` | Any data type not covered above; use `custom_extensions` for vendor-specific refinement. |
+
 ### Vendors
 
 Supported vendors for custom extensions and integrations.
@@ -202,6 +218,7 @@ Fields represent row-level attributes that can be used for grouping, filtering,
 | `dimension` | object | No | Dimension metadata (e.g., `is_time` flag) |
 | `label` | string | No | Label for categorization |
 | `description` | string | No | Human-readable description |
+| `datatype` | string (enum) | No | Logical data type for this field. See [Datatypes](#datatypes). |
 | `ai_context` | string/object | No | Additional context for AI tools (e.g., synonyms) |
 | `custom_extensions` | array | No | Vendor-specific attributes |
 
@@ -228,7 +245,7 @@ expression:
 
 | Field | Type | Description |
 |-------|------|-------------|
-| `is_time` | boolean | Indicates if this is a time-based dimension for temporal filtering |
+| `is_time` | boolean | Temporal-role marker. When `true`, consumers that distinguish time dimensions (e.g. for time-series analysis or temporal filtering) should treat this field as a time dimension. This is a *role* flag, independent of the field's data type. See [Datatype and `is_time`: type vs. role](#datatype-and-is_time-type-vs-role). |
 
 ### Examples
 
@@ -268,6 +285,7 @@ expression:
     dialects:
       - dialect: ANSI_SQL
         expression: order_date
+  datatype: date
   dimension:
     is_time: true
   description: Date when order was placed
@@ -290,6 +308,33 @@ expression:
   description: Normalized email address
 ```
 
+### Datatype and `is_time`: type vs. role
+
+`datatype` and `dimension.is_time` are independent properties that answer different questions:
+
+- **`datatype`** describes the *data type* of the field (e.g. `date`, `integer`, `string`, `timestamp_tz`): what kind of values the field holds.
+- **`dimension.is_time`** is a *temporal-role marker*: whether the field should be treated as a time dimension for time-series analysis or temporal filtering, regardless of its data type.
+
+**Default for `is_time`.** When `is_time` is not set explicitly, it defaults to `true` if `datatype` is one of `date`, `time`, `timestamp`, `timestamp_tz`, and `false` otherwise. Explicit `is_time` always wins. Set `is_time: false` on a temporal-typed column (e.g. an audit `created_at` you don't want on the time axis) to opt out of the default.
+
+Common combinations:
+
+| Column example | `datatype` | `is_time` | Effective role | Why |
+|---|---|---|---|---|
+| `d_date` (calendar date) | `date` | omitted | time dimension | Temporal `datatype`; `is_time` defaults to `true`. |
+| `order_timestamp` | `timestamp_tz` | omitted | time dimension | Temporal `datatype`; `is_time` defaults to `true`. |
+| `created_at` (audit timestamp) | `timestamp` | `false` | regular dimension | Explicit opt-out of the temporal default. |
+| `d_year` (integer year grain) | `integer` | `true` | time dimension | Non-temporal `datatype`; `is_time: true` makes the role explicit. |
+| `d_quarter_name` (e.g. `"Q1"`) | `string` | `true` | time dimension | String-valued temporal grain. |
+| `customer_id` | `integer` | omitted | regular dimension | Non-temporal `datatype`; `is_time` defaults to `false`. |
+
+> **Precedent.** This type/role separation mirrors [Snowflake Semantic Views' YAML authoring form](https://docs.snowflake.com/en/user-guide/views-semantic/semantic-view-yaml-spec), which has a structural `time_dimensions:` collection whose entries can carry any `data_type`. The published example annotates `order_year` with `data_type: NUMBER`. LookML supports a similar split via its [`dimension_group`](https://cloud.google.com/looker/docs/reference/param-field-dimension-group), whose `datatype` enum covers `date`, `datetime`, `timestamp`, plus the integer-encoded forms `epoch` and `yyyymmdd`.
+
+**Consumer guidance.**
+
+- For *data-type* questions (casting, serialization, downstream type inference): prefer `datatype` when present. If only `is_time: true` is set, do not infer a specific scalar type from it.
+- For *role* questions (classifying time dimensions in a query UI, generating time-series output sections, choosing time-aware aggregations): treat the field as a time dimension when `is_time` resolves to `true`, whether explicitly set or defaulted from a temporal `datatype`.
+
 ---
 
 ## Metrics
@@ -303,6 +348,7 @@ Quantitative measures defined on business data, representing key calculations li
 | `name` | string | Yes | Unique identifier for the metric |
 | `expression` | object | Yes | Expression definition with dialect support |
 | `description` | string | No | Human-readable description of what the metric measures |
+| `datatype` | string (enum) | No | Logical data type for this metric. See [Datatypes](#datatypes). |
 | `ai_context` | string/object | No | Additional context for AI tools (e.g., synonyms) |
 | `custom_extensions` | array | No | Vendor-specific attributes |
 
@@ -324,9 +370,11 @@ expression:
 ```yaml
 - name: total_revenue
   expression:
-    - dialect: ANSI_SQL
-      expression: SUM(orders.amount)
+    dialects:
+      - dialect: ANSI_SQL
+        expression: SUM(orders.amount)
   description: Total revenue across all orders
+  datatype: number
   ai_context:
     synonyms:
       - "total sales"
@@ -338,9 +386,11 @@ expression:
 ```yaml
 - name: avg_orders
   expression:
-    - dialect: ANSI_SQL
-      expression: SUM(orders.amount) / COUNT(DISTINCT customers.id)
+    dialects:
+      - dialect: ANSI_SQL
+        expression: SUM(orders.amount) / COUNT(DISTINCT customers.id)
   description: Average orders
+  datatype: number
   ai_context:
     synonyms:
       - "Order Average by customer"
@@ -446,6 +496,7 @@ semantic_model:
               dialects:
                 - dialect: ANSI_SQL
                   expression: order_date
+            datatype: date
             dimension:
               is_time: true
             description: Order date

diff --git a/core-spec/spec.yaml b/core-spec/spec.yaml
@@ -168,8 +168,17 @@ fields:
     # Optional: Dimension metadata
     # Indicates this field can be used as a dimension for grouping/filtering
     dimension:
-      # Optional: Indicates if this is a time-based dimension
-      # Used for time-series analysis and temporal filtering
+      # Optional: Temporal-role marker
+      # When true, consumers should treat this field as a time dimension
+      # for time-series analysis and temporal filtering. This is a *role*
+      # flag, independent of the field's data type. A field with
+      # is_time: true may carry any datatype (e.g. integer for a year
+      # grain, string for a month name, date/timestamp for a date column).
+      #
+      # Default: when unset, is_time defaults to true if datatype is one
+      # of date, time, timestamp, timestamp_tz, and false otherwise. Set
+      # is_time: false explicitly to opt a temporal-typed column out of
+      # time-dimension treatment.
       is_time: boolean
 
     # Optional: Label for categorization (e.g., "filter")
@@ -178,6 +187,11 @@ fields:
     # Optional: Human-readable description of the field
     description: string
 
+    # Optional: Logical data type for this field
+    # One of: string, integer, number, boolean, date, time, timestamp, timestamp_tz, other
+    # Use "other" + custom_extensions for vendor-specific types
+    datatype: string
+
     # Optional: Additional context for AI tools (e.g., synonyms, business terms)
     # Helps LLMs understand the field meaning and generate better queries
     ai_context: string
@@ -207,6 +221,11 @@ metrics:
     # Should explain what the metric measures and how it's used
     description: string
 
+    # Optional: Logical data type for this metric
+    # One of: string, integer, number, boolean, date, time, timestamp, timestamp_tz, other
+    # Use "other" + custom_extensions for vendor-specific types
+    datatype: string
+
     # Optional: Additional context for AI tools (e.g., synonyms, business context)
     # Helps LLMs understand the metric meaning and suggest it appropriately
     ai_context: string

diff --git a/docs/index.md b/docs/index.md
@@ -365,7 +365,7 @@ A practical guide for organizations looking to adopt OSI.
 |------|------------|
 | **Semantic Model** | A structured description of business data that defines datasets, fields, relationships, and metrics. It provides a shared vocabulary for interpreting data across tools and teams. |
 | **Dataset** | A logical representation of a business entity, typically corresponding to a fact table or dimension table in a data warehouse. |
-| **Field** | A row-level attribute within a dataset, used for grouping, filtering, or as part of metric expressions. Fields can be simple column references or computed expressions. |
+| **Field** | A row-level attribute within a dataset, used for grouping, filtering, or as part of metric expressions. Fields can be simple column references or computed expressions. A field's logical data type is declared by its optional `datatype` property (one of `string`, `integer`, `number`, `boolean`, `date`, `time`, `timestamp`, `timestamp_tz`, or `other`). |
 | **Dimension** | A categorical attribute used to slice and filter data (e.g., region, product category, date). In OSI, dimensions are represented as fields with optional metadata such as `is_time`. |
 | **Metric** | A quantitative measure computed by aggregating data across one or more datasets (e.g., total revenue, average order value). Metrics are defined at the semantic model level. |
 | **Relationship** | A foreign key connection between two datasets, defining how they can be joined. Relationships are always many-to-one (from the referencing dataset to the referenced dataset). |