From 350bbdf3b98eb40ab23b1dc1ab90e36e163d059e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20S=2E?= Date: Thu, 16 Apr 2026 14:16:35 +0200 Subject: [PATCH] Add grammar doc generator and doc-comments to grammar.y Add a script (meta/src/doc-gen/grammar_to_markdown.py) that parses grammar.y and generates a clean Markdown reference of the S-expression syntax, stripping construct/deconstruct actions and protobuf details. Non-terminal references in production bodies link to their definitions. Add doc-comments (# ...) to all grammar rules in grammar.y. Comments immediately above a rule name are extracted by the doc generator and rendered as descriptions in the output. Generate docs/grammar.md from the annotated grammar. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/grammar.md | 892 ++++++++++++++++++++++++ meta/src/doc-gen/grammar_to_markdown.py | 196 ++++++ meta/src/meta/grammar.y | 108 +++ 3 files changed, 1196 insertions(+) create mode 100644 docs/grammar.md create mode 100644 meta/src/doc-gen/grammar_to_markdown.py diff --git a/docs/grammar.md b/docs/grammar.md new file mode 100644 index 00000000..b6e01af5 --- /dev/null +++ b/docs/grammar.md @@ -0,0 +1,892 @@ +# LQP S-Expression Grammar + +This document describes the S-expression syntax of the Logical Query Protocol. +It is auto-generated from `grammar.y` with construct/deconstruct actions removed. + +## Terminals + +- **DECIMAL** — `[-]?\d+\.\d+d\d+` +- **FLOAT32** — `([-]?\d+\.\d+f32|inf32|nan32)` +- **FLOAT** — `([-]?\d+\.\d+|inf|nan)` +- **INT32** — `[-]?\d+i32` +- **INT** — `[-]?\d+` +- **UINT32** — `\d+u32` +- **INT128** — `[-]?\d+i128` +- **STRING** — `"(?:[^"\\]|\\.)*"` +- **SYMBOL** — `[a-zA-Z_][a-zA-Z0-9_.#/-]*` +- **UINT128** — `0x[0-9a-fA-F]+` + +### Token Aliases (formatted variants) + +These are display variants used in the pretty printer; they parse identically to the base token. 
+ +- **FORMATTED_DECIMAL** → DECIMAL +- **FORMATTED_FLOAT** → FLOAT +- **FORMATTED_FLOAT32** → FLOAT32 +- **FORMATTED_INT** → INT +- **FORMATTED_INT128** → INT128 +- **FORMATTED_INT32** → INT32 +- **FORMATTED_STRING** → STRING +- **FORMATTED_UINT128** → UINT128 +- **FORMATTED_UINT32** → UINT32 + +## Grammar Rules + +### transaction + +The top-level unit of communication. Groups one or more epochs +with optional configuration and synchronization directives. + + `(` `transaction` [configure](#configure)? [sync](#sync)? [epoch](#epoch)* `)` + +### configure + +Transaction-level configuration settings (e.g., IVM maintenance level, semantics version). + + `(` `configure` [config_dict](#config_dict) `)` + +### config_dict + +A dictionary of key-value pairs enclosed in braces. + + `{` [config_key_value](#config_key_value)* `}` + +### config_key_value + +A single key-value entry in a config dictionary. The key is a colon-prefixed symbol. + + `:` SYMBOL [raw_value](#raw_value) + +### value + +A typed constant value. Uses formatted token variants for pretty-printable output. + + [date](#date) +| [datetime](#datetime) +| FORMATTED_STRING +| FORMATTED_INT32 +| FORMATTED_INT +| FORMATTED_FLOAT32 +| FORMATTED_FLOAT +| FORMATTED_UINT32 +| FORMATTED_UINT128 +| FORMATTED_INT128 +| FORMATTED_DECIMAL +| `missing` +| [boolean_value](#boolean_value) + +### raw_value + +A typed constant value using raw (unformatted) token variants. + + [raw_date](#raw_date) +| [raw_datetime](#raw_datetime) +| STRING +| INT32 +| INT +| FLOAT32 +| FLOAT +| UINT32 +| UINT128 +| INT128 +| DECIMAL +| `missing` +| [boolean_value](#boolean_value) + +### raw_date + +A date literal with year, month, day components (raw token variant). + + `(` `date` INT INT INT `)` + +### date + +A date literal with year, month, day components (formatted token variant). 
+ + `(` `date` FORMATTED_INT FORMATTED_INT FORMATTED_INT `)` + +### raw_datetime + +A datetime literal with year, month, day, hour, minute, second, and optional +microsecond components (raw token variant). + + `(` `datetime` INT INT INT INT INT INT INT? `)` + +### datetime + +A datetime literal with year, month, day, hour, minute, second, and optional +microsecond components (formatted token variant). + + `(` `datetime` FORMATTED_INT FORMATTED_INT FORMATTED_INT FORMATTED_INT FORMATTED_INT FORMATTED_INT FORMATTED_INT? `)` + +### boolean_value + +A boolean literal: `true` or `false`. + + `true` +| `false` + +### sync + +Synchronization directive listing fragments that must be loaded before evaluation. + + `(` `sync` [fragment_id](#fragment_id)* `)` + +### fragment_id + +A colon-prefixed identifier for a fragment. + + `:` SYMBOL + +### epoch + +An epoch is a unit of execution within a transaction, containing optional writes and reads. + + `(` `epoch` [epoch_writes](#epoch_writes)? [epoch_reads](#epoch_reads)? `)` + +### epoch_writes + +The write section of an epoch, containing zero or more write operations. + + `(` `writes` [write](#write)* `)` + +### write + +A single write operation: define, undefine, context, or snapshot. + + [define](#define) +| [undefine](#undefine) +| [context](#context) +| [snapshot](#snapshot) + +### define + +Installs a fragment of declarations into the database. + + `(` `define` [fragment](#fragment) `)` + +### fragment + +A named group of declarations (defs, algorithms, constraints, data). + + `(` `fragment` [new_fragment_id](#new_fragment_id) [declaration](#declaration)* `)` + +### new_fragment_id + + [fragment_id](#fragment_id) + +### declaration + +A single declaration within a fragment. + + [def](#def) +| [algorithm](#algorithm) +| [constraint](#constraint) +| [data](#data) + +### def + +A rule definition: binds a relation name to an abstraction (the rule body), +with optional attributes. 
+ + `(` `def` [relation_id](#relation_id) [abstraction](#abstraction) [attrs](#attrs)? `)` + +### relation_id + +Identifies a relation, either by a colon-prefixed symbolic name or a numeric hash. + + `:` SYMBOL +| UINT128 + +### abstraction + +A lambda-like construct: a list of typed variable bindings followed by a formula body. + + `(` [bindings](#bindings) [formula](#formula) `)` + +### bindings + +A bracketed list of variable bindings, with an optional value-bindings section +separated by `|`. + + `[` [binding](#binding)* [value_bindings](#value_bindings)? `]` + +### binding + +A single typed variable binding: `name :: type`. + + SYMBOL `::` [type](#type) + +### type + +A type annotation for a variable binding. + + [unspecified_type](#unspecified_type) +| [string_type](#string_type) +| [int_type](#int_type) +| [float_type](#float_type) +| [uint128_type](#uint128_type) +| [int128_type](#int128_type) +| [date_type](#date_type) +| [datetime_type](#datetime_type) +| [missing_type](#missing_type) +| [decimal_type](#decimal_type) +| [boolean_type](#boolean_type) +| [int32_type](#int32_type) +| [float32_type](#float32_type) +| [uint32_type](#uint32_type) + +### unspecified_type + + `UNKNOWN` + +### string_type + + `STRING` + +### int32_type + + `INT32` + +### int_type + + `INT` + +### float32_type + + `FLOAT32` + +### float_type + + `FLOAT` + +### uint32_type + + `UINT32` + +### uint128_type + + `UINT128` + +### int128_type + + `INT128` + +### date_type + + `DATE` + +### datetime_type + + `DATETIME` + +### missing_type + + `MISSING` + +### decimal_type + +A fixed-point decimal type with precision and scale parameters. + + `(` `DECIMAL` INT INT `)` + +### boolean_type + + `BOOLEAN` + +### value_bindings + +The value-bindings section of a bindings list, separated from key bindings by `|`. +Used to distinguish key and value variables in upsert and monoid operations. + + `|` [binding](#binding)* + +### formula + +A logical formula: the body of an abstraction. 
Can be a conjunction, disjunction, +negation, existential quantification, reduction, atom, primitive, or other form. + + [true](#true) +| [false](#false) +| [exists](#exists) +| [reduce](#reduce) +| [conjunction](#conjunction) +| [disjunction](#disjunction) +| [not](#not) +| [ffi](#ffi) +| [atom](#atom) +| [pragma](#pragma) +| [primitive](#primitive) +| [rel_atom](#rel_atom) +| [cast](#cast) + +### true + +The trivially true formula (empty conjunction). + + `(` `true` `)` + +### false + +The trivially false formula (empty disjunction). + + `(` `false` `)` + +### exists + +Existential quantification: introduces locally scoped variables. + + `(` `exists` [bindings](#bindings) [formula](#formula) `)` + +### reduce + +Aggregation: applies a binary operator (op) over the results of a body abstraction, +with initial seed terms. + + `(` `reduce` [abstraction](#abstraction) [abstraction](#abstraction) [terms](#terms) `)` + +### term + +A term is either a variable or a constant value. + + [var](#var) +| [value](#value) + +### var + +A variable reference. + + SYMBOL + +### conjunction + +Logical AND of zero or more formulas. + + `(` `and` [formula](#formula)* `)` + +### disjunction + +Logical OR of zero or more formulas. + + `(` `or` [formula](#formula)* `)` + +### not + +Logical negation of a formula. + + `(` `not` [formula](#formula) `)` + +### ffi + +A foreign function interface call with a name, abstraction arguments, and terms. + + `(` `ffi` [name](#name) [ffi_args](#ffi_args) [terms](#terms) `)` + +### ffi_args + +The argument abstractions of an FFI call. + + `(` `args` [abstraction](#abstraction)* `)` + +### terms + +A parenthesized list of terms. + + `(` `terms` [term](#term)* `)` + +### name + +A colon-prefixed symbolic name. + + `:` SYMBOL + +### atom + +A relational atom: applies a named relation to a list of terms. + + `(` `atom` [relation_id](#relation_id) [term](#term)* `)` + +### pragma + +A compiler pragma: a named directive with term arguments. 
+ + `(` `pragma` [name](#name) [term](#term)* `)` + +### primitive + +A built-in primitive operation. Includes syntactic sugar for common comparisons +and arithmetic (`=`, `<`, `+`, etc.) as well as a generic named form. + + [eq](#eq) +| [lt](#lt) +| [lt_eq](#lt_eq) +| [gt](#gt) +| [gt_eq](#gt_eq) +| [add](#add) +| [minus](#minus) +| [multiply](#multiply) +| [divide](#divide) +| `(` `primitive` [name](#name) [rel_term](#rel_term)* `)` + +### eq + + `(` `=` [term](#term) [term](#term) `)` + +### lt + + `(` `<` [term](#term) [term](#term) `)` + +### lt_eq + + `(` `<=` [term](#term) [term](#term) `)` + +### gt + + `(` `>` [term](#term) [term](#term) `)` + +### gt_eq + + `(` `>=` [term](#term) [term](#term) `)` + +### add + + `(` `+` [term](#term) [term](#term) [term](#term) `)` + +### minus + + `(` `-` [term](#term) [term](#term) [term](#term) `)` + +### multiply + + `(` `*` [term](#term) [term](#term) [term](#term) `)` + +### divide + + `(` `/` [term](#term) [term](#term) [term](#term) `)` + +### rel_term + +A relational term: either a regular term or a specialized (hash-prefixed) constant value. + + [specialized_value](#specialized_value) +| [term](#term) + +### specialized_value + +A hash-prefixed constant value used for type specialization in primitives and rel_atoms. + + `#` [raw_value](#raw_value) + +### rel_atom + +A relational atom with support for specialized value terms. + + `(` `relatom` [name](#name) [rel_term](#rel_term)* `)` + +### cast + +A type cast from an input term to a result term. + + `(` `cast` [term](#term) [term](#term) `)` + +### attrs + +A list of attributes attached to a def, algorithm, or instruction. + + `(` `attrs` [attribute](#attribute)* `)` + +### attribute + +A single named attribute with zero or more value arguments. + + `(` `attribute` [name](#name) [raw_value](#raw_value)* `)` + +### algorithm + +An imperative algorithm declaration with global relation references, a script body, +and optional attributes. 
+ + `(` `algorithm` [relation_id](#relation_id)* [script](#script) [attrs](#attrs)? `)` + +### script + +The body of an algorithm: a sequence of constructs (loops and instructions). + + `(` `script` [construct](#construct)* `)` + +### construct + +A single construct within a script: either a loop or an instruction. + + [loop](#loop) +| [instruction](#instruction) + +### loop + +A loop construct with initialization instructions and a script body. + + `(` `loop` [init](#init) [script](#script) [attrs](#attrs)? `)` + +### init + +The initialization block of a loop. + + `(` `init` [instruction](#instruction)* `)` + +### instruction + +A single imperative instruction within a script or loop. + + [assign](#assign) +| [upsert](#upsert) +| [break](#break) +| [monoid_def](#monoid_def) +| [monus_def](#monus_def) + +### assign + +Assigns a relation to the result of an abstraction (replaces existing tuples). + + `(` `assign` [relation_id](#relation_id) [abstraction](#abstraction) [attrs](#attrs)? `)` + +### upsert + +Merges tuples into a relation using a monoid-based update (insert or update existing). + + `(` `upsert` [relation_id](#relation_id) [abstraction_with_arity](#abstraction_with_arity) [attrs](#attrs)? `)` + +### abstraction_with_arity + +An abstraction that distinguishes key bindings from value bindings via the `|` separator. +The value arity is derived from the number of value bindings. + + `(` [bindings](#bindings) [formula](#formula) `)` + +### break + +A loop termination condition: breaks when the relation matches the abstraction. + + `(` `break` [relation_id](#relation_id) [abstraction](#abstraction) [attrs](#attrs)? `)` + +### monoid_def + +Defines an aggregation over a relation using a monoid (or, min, max, sum). + + `(` `monoid` [monoid](#monoid) [relation_id](#relation_id) [abstraction_with_arity](#abstraction_with_arity) [attrs](#attrs)? `)` + +### monoid + +The type of aggregation monoid. 
+ + [or_monoid](#or_monoid) +| [min_monoid](#min_monoid) +| [max_monoid](#max_monoid) +| [sum_monoid](#sum_monoid) + +### or_monoid + + `(` `or` `)` + +### min_monoid + + `(` `min` [type](#type) `)` + +### max_monoid + + `(` `max` [type](#type) `)` + +### sum_monoid + + `(` `sum` [type](#type) `)` + +### monus_def + +Defines a monus (subtraction) operation over a relation using a monoid. + + `(` `monus` [monoid](#monoid) [relation_id](#relation_id) [abstraction_with_arity](#abstraction_with_arity) [attrs](#attrs)? `)` + +### constraint + +A functional dependency constraint on a relation: given the key variables, +the value variables are uniquely determined. + + `(` `functional_dependency` [relation_id](#relation_id) [abstraction](#abstraction) [functional_dependency_keys](#functional_dependency_keys) [functional_dependency_values](#functional_dependency_values) `)` + +### functional_dependency_keys + + `(` `keys` [var](#var)* `)` + +### functional_dependency_values + + `(` `values` [var](#var)* `)` + +### data + +A data source declaration: external data (EDB), B-tree, CSV, or Iceberg. + + [edb](#edb) +| [betree_relation](#betree_relation) +| [csv_data](#csv_data) +| [iceberg_data](#iceberg_data) + +### edb_path + + `[` STRING* `]` + +### edb_types + + `[` [type](#type)* `]` + +### edb + +An extensional database (EDB) declaration: maps a relation to stored data at a given +path with specified column types. + + `(` `edb` [relation_id](#relation_id) [edb_path](#edb_path) [edb_types](#edb_types) `)` + +### betree_relation + +A B-epsilon-tree backed relation with storage configuration and locator info. + + `(` `betree_relation` [relation_id](#relation_id) [betree_info](#betree_info) `)` + +### betree_info + +Storage metadata for a B-tree relation: key/value types and configuration parameters. 
+ + `(` `betree_info` [betree_info_key_types](#betree_info_key_types) [betree_info_value_types](#betree_info_value_types) [config_dict](#config_dict) `)` + +### betree_info_key_types + + `(` `key_types` [type](#type)* `)` + +### betree_info_value_types + + `(` `value_types` [type](#type)* `)` + +### gnf_columns + +A list of GNF (Generalized Normal Form) column definitions. + + `(` `columns` [gnf_column](#gnf_column)* `)` + +### csv_asof + +A timestamp indicating the point-in-time for the CSV data snapshot. + + `(` `asof` STRING `)` + +### csv_data + +A CSV data source with locator, configuration, column definitions, and a snapshot timestamp. + + `(` `csv_data` [csvlocator](#csvlocator) [csv_config](#csv_config) [gnf_columns](#gnf_columns) [csv_asof](#csv_asof) `)` + +### csv_locator_paths + + `(` `paths` STRING* `)` + +### csv_locator_inline_data + + `(` `inline_data` STRING `)` + +### csvlocator + +Locates CSV data: either by file paths, inline data, or both. + + `(` `csv_locator` [csv_locator_paths](#csv_locator_paths)? [csv_locator_inline_data](#csv_locator_inline_data)? `)` + +### csv_config + +CSV parsing configuration (delimiter, quotechar, escapechar, encoding, etc.). + + `(` `csv_config` [config_dict](#config_dict) `)` + +### gnf_column_path + +The path identifying a column: a single string or a bracketed list of strings for nested paths. + + STRING +| `[` STRING* `]` + +### gnf_column + +A single GNF column definition with a path, optional target relation, and column types. + + `(` `column` [gnf_column_path](#gnf_column_path) [relation_id](#relation_id)? 
`[` [type](#type)* `]` `)` + +### iceberg_property_entry + + `(` `prop` STRING STRING `)` + +### iceberg_masked_property_entry + + `(` `prop` STRING STRING `)` + +### iceberg_from_snapshot + + `(` `from_snapshot` STRING `)` + +### iceberg_locator_table_name + + `(` `table_name` STRING `)` + +### iceberg_locator_namespace + + `(` `namespace` STRING* `)` + +### iceberg_locator_warehouse + + `(` `warehouse` STRING `)` + +### iceberg_locator + +Identifies an Iceberg table by its name, namespace, and warehouse. + + `(` `iceberg_locator` [iceberg_locator_table_name](#iceberg_locator_table_name) [iceberg_locator_namespace](#iceberg_locator_namespace) [iceberg_locator_warehouse](#iceberg_locator_warehouse) `)` + +### iceberg_catalog_config_scope + + `(` `scope` STRING `)` + +### iceberg_catalog_uri + + `(` `catalog_uri` STRING `)` + +### iceberg_properties + + `(` `properties` [iceberg_property_entry](#iceberg_property_entry)* `)` + +### iceberg_auth_properties + + `(` `auth_properties` [iceberg_masked_property_entry](#iceberg_masked_property_entry)* `)` + +### iceberg_catalog_config + +Configuration for an Iceberg catalog: URI, optional scope, properties, and auth properties. + + `(` `iceberg_catalog_config` [iceberg_catalog_uri](#iceberg_catalog_uri) [iceberg_catalog_config_scope](#iceberg_catalog_config_scope)? [iceberg_properties](#iceberg_properties) [iceberg_auth_properties](#iceberg_auth_properties) `)` + +### iceberg_to_snapshot + + `(` `to_snapshot` STRING `)` + +### iceberg_data + +An Iceberg data source with locator, catalog config, columns, optional snapshot range, +and a flag indicating whether it returns delta data. + + `(` `iceberg_data` [iceberg_locator](#iceberg_locator) [iceberg_catalog_config](#iceberg_catalog_config) [gnf_columns](#gnf_columns) [iceberg_from_snapshot](#iceberg_from_snapshot)? [iceberg_to_snapshot](#iceberg_to_snapshot)? [boolean_value](#boolean_value) `)` + +### undefine + +Removes a previously defined fragment from the database. 
+ + `(` `undefine` [fragment_id](#fragment_id) `)` + +### context + +Declares the context relations that are visible for the current epoch. + + `(` `context` [relation_id](#relation_id)* `)` + +### snapshot_mapping + +Maps a destination EDB path to a source relation for snapshotting. + + [edb_path](#edb_path) [relation_id](#relation_id) + +### snapshot + +Snapshots relations into EDB storage at a given prefix path. + + `(` `snapshot` [edb_path](#edb_path) [snapshot_mapping](#snapshot_mapping)* `)` + +### epoch_reads + +The read section of an epoch, containing zero or more read operations. + + `(` `reads` [read](#read)* `)` + +### read + +A single read operation: demand, output, what-if, abort, or export. + + [demand](#demand) +| [output](#output) +| [what_if](#what_if) +| [abort](#abort) +| [export](#export) + +### demand + +Requests evaluation of a relation without naming the output. + + `(` `demand` [relation_id](#relation_id) `)` + +### output + +Requests evaluation of a relation and assigns a name to the output. + + `(` `output` [name](#name) [relation_id](#relation_id) `)` + +### what_if + +A hypothetical branch: evaluates an epoch in a named sandbox without committing writes. + + `(` `what_if` [name](#name) [epoch](#epoch) `)` + +### abort + +Aborts the transaction if the given relation is non-empty, with an optional name. + + `(` `abort` [name](#name)? [relation_id](#relation_id) `)` + +### export + +Exports data to an external format (CSV or Iceberg). + + `(` `export` [export_csv_config](#export_csv_config) `)` +| `(` `export_iceberg` [export_iceberg_config](#export_iceberg_config) `)` + +### export_csv_config + +Configuration for CSV export. The v2 variant uses a source specification and csv_config; +the legacy variant uses explicit column list and config dict. 
+ + `(` `export_csv_config_v2` [export_csv_path](#export_csv_path) [export_csv_source](#export_csv_source) [csv_config](#csv_config) `)` +| `(` `export_csv_config` [export_csv_path](#export_csv_path) [export_csv_columns_list](#export_csv_columns_list) [config_dict](#config_dict) `)` + +### export_csv_path + + `(` `path` STRING `)` + +### export_csv_columns_list + + `(` `columns` [export_csv_column](#export_csv_column)* `)` + +### export_csv_column + + `(` `column` STRING [relation_id](#relation_id) `)` + +### export_csv_source + +The data source for a v2 CSV export: either explicit GNF columns or a table definition. + + `(` `gnf_columns` [export_csv_column](#export_csv_column)* `)` +| `(` `table_def` [relation_id](#relation_id) `)` + +### export_iceberg_table_def + + `(` `table_def` [relation_id](#relation_id) `)` + +### iceberg_table_properties + + `(` `table_properties` [iceberg_property_entry](#iceberg_property_entry)* `)` + +### export_iceberg_config + +Configuration for Iceberg export: locator, catalog config, table definition, +table properties, and optional additional config. + + `(` `export_iceberg_config` [iceberg_locator](#iceberg_locator) [iceberg_catalog_config](#iceberg_catalog_config) [export_iceberg_table_def](#export_iceberg_table_def) [iceberg_table_properties](#iceberg_table_properties) [config_dict](#config_dict)? `)` diff --git a/meta/src/doc-gen/grammar_to_markdown.py b/meta/src/doc-gen/grammar_to_markdown.py new file mode 100644 index 00000000..a092c7bb --- /dev/null +++ b/meta/src/doc-gen/grammar_to_markdown.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +"""Parse grammar.y and generate a clean S-expression grammar in Markdown. + +Strips construct/deconstruct actions, type annotations, and protobuf details, +leaving only the pure S-expression syntax. 
+ +Usage: + python grammar_to_markdown.py [grammar.y] [output.md] +""" + +import re +import sys +from pathlib import Path + + +def parse_file(path: str) -> tuple[list, dict, list]: + """Parse grammar.y into tokens, aliases, and rules.""" + text = Path(path).read_text() + # Split on %% that appears alone on a line (not inside comments) + sections = re.split(r"^\s*%%\s*$", text, flags=re.MULTILINE) + header = sections[0] + rules_text = sections[1] if len(sections) > 1 else "" + + tokens = [] + aliases = {} + + for line in header.splitlines(): + line = line.strip() + if line.startswith("%token_alias"): + parts = line.split() + aliases[parts[1]] = parts[2] + elif line.startswith("%token ") and not line.startswith("%token_alias"): + parts = line.split(None, 3) + # %token NAME Type PATTERN + name = parts[1] + raw_pattern = parts[3] if len(parts) > 3 else "" + # Strip r'...' or '...' wrapper + pattern = re.sub(r"^r?'(.*)'$", r"\1", raw_pattern) + tokens.append((name, pattern)) + + rules = parse_rules(rules_text) + return tokens, aliases, rules + + +def parse_rules(text: str) -> list[tuple[str, str | None, list[str]]]: + """Extract rule names, doc-comments, and production alternatives. + + A doc-comment is a block of consecutive #-comment lines immediately + preceding a rule name (no blank line in between). 
+ """ + rules = [] + current_rule = None + current_doc = None + current_alts = [] + comment_buf: list[str] = [] + + for line in text.splitlines(): + stripped = line.strip() + + # Blank line: reset comment buffer + if not stripped: + comment_buf = [] + continue + + # Comment line: accumulate in buffer + if stripped.startswith("#"): + comment_buf.append(stripped.lstrip("# ")) + continue + + # Rule name: non-indented identifier + if line and not line[0].isspace(): + if current_rule and current_alts: + rules.append((current_rule, current_doc, current_alts)) + current_rule = stripped + current_doc = "\n".join(comment_buf) if comment_buf else None + current_alts = [] + comment_buf = [] + continue + + # Production line: starts with : or | + if stripped.startswith(":") or stripped.startswith("|"): + prod = stripped[1:].strip() + current_alts.append(prod) + comment_buf = [] + continue + + # Everything else is an action line — skip + comment_buf = [] + + if current_rule and current_alts: + rules.append((current_rule, current_doc, current_alts)) + + return rules + + +def format_production(prod: str, nonterms: set[str]) -> str: + """Format a single production for markdown display. + + Quoted literals like "(" become `(`, non-terminals become links + to their definitions, and modifiers (?, *, +) are preserved. 
+ """ + # Tokenize: quoted strings, symbols with modifiers + result = [] + for tok in re.findall(r'"(?:[^"\\]|\\.)*"[?*+]?|[^\s]+', prod): + if tok.startswith('"'): + # Strip quotes, keep modifier + m = re.match(r'"(.*)"([?*+]?)', tok) + if m: + literal, mod = m.group(1), m.group(2) + result.append(f'`{literal}`{mod}') + else: + # Split off trailing modifier (?, *, +) + m = re.match(r"^([a-zA-Z_][a-zA-Z0-9_]*)([?*+]?)$", tok) + if m and m.group(1) in nonterms: + name, mod = m.group(1), m.group(2) + result.append(f"[{name}](#{name}){mod}") + else: + result.append(tok) + return " ".join(result) + + +def generate_markdown( + tokens: list, aliases: dict, rules: list[tuple[str, str | None, list[str]]] +) -> str: + """Generate a markdown document from parsed grammar components.""" + lines = [] + lines.append("# LQP S-Expression Grammar") + lines.append("") + lines.append( + "This document describes the S-expression syntax of the Logical Query Protocol." + ) + lines.append( + "It is auto-generated from `grammar.y` with construct/deconstruct actions removed." + ) + lines.append("") + + # Tokens + lines.append("## Terminals") + lines.append("") + for name, pattern in tokens: + if pattern: + lines.append(f"- **{name}** — `{pattern}`") + else: + lines.append(f"- **{name}**") + lines.append("") + + if aliases: + lines.append("### Token Aliases (formatted variants)") + lines.append("") + lines.append( + "These are display variants used in the pretty printer;" + " they parse identically to the base token." 
+ ) + lines.append("") + for alias, base in sorted(aliases.items()): + lines.append(f"- **{alias}** → {base}") + lines.append("") + + # Rules + lines.append("## Grammar Rules") + lines.append("") + + nonterms = {name for name, _, _ in rules} + + for rule_name, doc, alts in rules: + lines.append(f"### {rule_name}") + lines.append("") + if doc: + lines.append(doc) + lines.append("") + for i, alt in enumerate(alts): + if i == 0: + lines.append(f" {format_production(alt, nonterms)} ") + else: + lines.append(f"| {format_production(alt, nonterms)} ") + lines.append("") + + return "\n".join(lines) + + +def main(): + grammar_path = sys.argv[1] if len(sys.argv) > 1 else "meta/src/meta/grammar.y" + output_path = sys.argv[2] if len(sys.argv) > 2 else None + + tokens, aliases, rules = parse_file(grammar_path) + md = generate_markdown(tokens, aliases, rules) + + if output_path: + Path(output_path).write_text(md) + print(f"Written to {output_path}") + else: + print(md) + + +if __name__ == "__main__": + main() diff --git a/meta/src/meta/grammar.y b/meta/src/meta/grammar.y index 8f2d743e..52dd5bb5 100644 --- a/meta/src/meta/grammar.y +++ b/meta/src/meta/grammar.y @@ -219,6 +219,8 @@ %% +# The top-level unit of communication. Groups one or more epochs +# with optional configuration and synchronization directives. transaction : "(" "transaction" configure? sync? epoch* ")" construct: $$ = transactions.Transaction(epochs=$5, configure=builtin.unwrap_option_or($3, default_configure()), sync=$4) @@ -227,14 +229,17 @@ transaction $4: Optional[transactions.Sync] = $$.sync if builtin.has_proto_field($$, "sync") else None $5: Sequence[transactions.Epoch] = $$.epochs +# Transaction-level configuration settings (e.g., IVM maintenance level, semantics version). configure : "(" "configure" config_dict ")" construct: $$ = construct_configure($3) deconstruct: $3: Sequence[Tuple[String, logic.Value]] = deconstruct_configure($$) +# A dictionary of key-value pairs enclosed in braces. 
config_dict : "{" config_key_value* "}" +# A single key-value entry in a config dictionary. The key is a colon-prefixed symbol. config_key_value : ":" SYMBOL raw_value construct: $$ = builtin.tuple($2, $3) @@ -242,6 +247,7 @@ config_key_value $2: String = $$[0] $3: logic.Value = $$[1] +# A typed constant value. Uses formatted token variants for pretty-printable output. value : date construct: $$ = logic.Value(date_value=$1) @@ -294,6 +300,7 @@ value deconstruct if builtin.has_proto_field($$, 'boolean_value'): $1: Boolean = $$.boolean_value +# A typed constant value using raw (unformatted) token variants. raw_value : raw_date construct: $$ = logic.Value(date_value=$1) @@ -346,6 +353,7 @@ raw_value deconstruct if builtin.has_proto_field($$, 'boolean_value'): $1: Boolean = $$.boolean_value +# A date literal with year, month, day components (raw token variant). raw_date : "(" "date" INT INT INT ")" construct: $$ = logic.DateValue(year=builtin.int64_to_int32($3), month=builtin.int64_to_int32($4), day=builtin.int64_to_int32($5)) @@ -354,6 +362,7 @@ raw_date $4: Int64 = builtin.int32_to_int64($$.month) $5: Int64 = builtin.int32_to_int64($$.day) +# A date literal with year, month, day components (formatted token variant). date : "(" "date" FORMATTED_INT FORMATTED_INT FORMATTED_INT ")" construct: $$ = logic.DateValue(year=builtin.int64_to_int32($3), month=builtin.int64_to_int32($4), day=builtin.int64_to_int32($5)) @@ -362,6 +371,8 @@ date $4: Int64 = builtin.int32_to_int64($$.month) $5: Int64 = builtin.int32_to_int64($$.day) +# A datetime literal with year, month, day, hour, minute, second, and optional +# microsecond components (raw token variant). raw_datetime : "(" "datetime" INT INT INT INT INT INT INT? 
")" construct: $$ = logic.DateTimeValue(year=builtin.int64_to_int32($3), month=builtin.int64_to_int32($4), day=builtin.int64_to_int32($5), hour=builtin.int64_to_int32($6), minute=builtin.int64_to_int32($7), second=builtin.int64_to_int32($8), microsecond=builtin.int64_to_int32(builtin.unwrap_option_or($9, 0))) @@ -374,6 +385,8 @@ raw_datetime $8: Int64 = builtin.int32_to_int64($$.second) $9: Optional[Int64] = builtin.some(builtin.int32_to_int64($$.microsecond)) +# A datetime literal with year, month, day, hour, minute, second, and optional +# microsecond components (formatted token variant). datetime : "(" "datetime" FORMATTED_INT FORMATTED_INT FORMATTED_INT FORMATTED_INT FORMATTED_INT FORMATTED_INT FORMATTED_INT? ")" construct: $$ = logic.DateTimeValue(year=builtin.int64_to_int32($3), month=builtin.int64_to_int32($4), day=builtin.int64_to_int32($5), hour=builtin.int64_to_int32($6), minute=builtin.int64_to_int32($7), second=builtin.int64_to_int32($8), microsecond=builtin.int64_to_int32(builtin.unwrap_option_or($9, 0))) @@ -386,6 +399,7 @@ datetime $8: Int64 = builtin.int32_to_int64($$.second) $9: Optional[Int64] = builtin.some(builtin.int32_to_int64($$.microsecond)) +# A boolean literal: `true` or `false`. boolean_value : "true" construct: $$ = True @@ -396,16 +410,19 @@ boolean_value deconstruct if not $$: pass +# Synchronization directive listing fragments that must be loaded before evaluation. sync : "(" "sync" fragment_id* ")" construct: $$ = transactions.Sync(fragments=$3) deconstruct: $3: Sequence[fragments.FragmentId] = $$.fragments +# A colon-prefixed identifier for a fragment. fragment_id : ":" SYMBOL construct: $$ = builtin.fragment_id_from_string($2) deconstruct: $2: String = builtin.fragment_id_to_string($$) +# An epoch is a unit of execution within a transaction, containing optional writes and reads. epoch : "(" "epoch" epoch_writes? epoch_reads? 
")" construct: $$ = transactions.Epoch(writes=builtin.unwrap_option_or($3, list[transactions.Write]()), reads=builtin.unwrap_option_or($4, list[transactions.Read]())) @@ -413,9 +430,11 @@ epoch $3: Optional[Sequence[transactions.Write]] = $$.writes if not builtin.is_empty($$.writes) else None $4: Optional[Sequence[transactions.Read]] = $$.reads if not builtin.is_empty($$.reads) else None +# The write section of an epoch, containing zero or more write operations. epoch_writes : "(" "writes" write* ")" +# A single write operation: define, undefine, context, or snapshot. write : define construct: $$ = transactions.Write(define=$1) @@ -434,11 +453,13 @@ write deconstruct if builtin.has_proto_field($$, 'snapshot'): $1: transactions.Snapshot = $$.snapshot +# Installs a fragment of declarations into the database. define : "(" "define" fragment ")" construct: $$ = transactions.Define(fragment=$3) deconstruct: $3: fragments.Fragment = $$.fragment +# A named group of declarations (defs, algorithms, constraints, data). fragment : "(" "fragment" new_fragment_id declaration* ")" construct: $$ = builtin.construct_fragment($3, $4) @@ -453,6 +474,7 @@ new_fragment_id builtin.start_fragment($1) $$ = $1 +# A single declaration within a fragment. declaration : def construct: $$ = logic.Declaration(def=$1) @@ -471,6 +493,8 @@ declaration deconstruct if builtin.has_proto_field($$, 'data'): $1: logic.Data = $$.data +# A rule definition: binds a relation name to an abstraction (the rule body), +# with optional attributes. def : "(" "def" relation_id abstraction attrs? ")" construct: $$ = logic.Def(name=$3, body=$4, attrs=builtin.unwrap_option_or($5, list[logic.Attribute]())) @@ -479,6 +503,7 @@ def $4: logic.Abstraction = $$.body $5: Optional[Sequence[logic.Attribute]] = $$.attrs if not builtin.is_empty($$.attrs) else None +# Identifies a relation, either by a colon-prefixed symbolic name or a numeric hash. 
relation_id : ":" SYMBOL construct: $$ = builtin.relation_id_from_string($2) @@ -489,6 +514,7 @@ relation_id deconstruct: $1: logic.UInt128Value = deconstruct_relation_id_uint128($$) +# A lambda-like construct: a list of typed variable bindings followed by a formula body. abstraction : "(" bindings formula ")" construct: $$ = logic.Abstraction(vars=builtin.list_concat($2[0], $2[1]), value=$3) @@ -496,6 +522,8 @@ abstraction $2: Tuple[Sequence[logic.Binding], Sequence[logic.Binding]] = deconstruct_bindings($$) $3: logic.Formula = $$.value +# A bracketed list of variable bindings, with an optional value-bindings section +# separated by `|`. bindings : "[" binding* value_bindings? "]" construct: $$ = builtin.tuple($2, builtin.unwrap_option_or($3, list[logic.Binding]())) @@ -503,6 +531,7 @@ bindings $2: Sequence[logic.Binding] = $$[0] $3: Optional[Sequence[logic.Binding]] = $$[1] if not builtin.is_empty($$[1]) else None +# A single typed variable binding: `name :: type`. binding : SYMBOL "::" type construct: $$ = logic.Binding(var=logic.Var(name=$1), type=$3) @@ -510,6 +539,7 @@ binding $1: String = $$.var.name $3: logic.Type = $$.type +# A type annotation for a variable binding. type : unspecified_type construct: $$ = logic.Type(unspecified_type=$1) @@ -616,6 +646,7 @@ missing_type : "MISSING" construct: $$ = logic.MissingType() +# A fixed-point decimal type with precision and scale parameters. decimal_type : "(" "DECIMAL" INT INT ")" construct: $$ = logic.DecimalType(precision=builtin.int64_to_int32($3), scale=builtin.int64_to_int32($4)) @@ -627,9 +658,13 @@ boolean_type : "BOOLEAN" construct: $$ = logic.BooleanType() +# The value-bindings section of a bindings list, separated from key bindings by `|`. +# Used to distinguish key and value variables in upsert and monoid operations. value_bindings : "|" binding* +# A logical formula: the body of an abstraction. 
Can be a conjunction, disjunction, +# negation, existential quantification, reduction, atom, primitive, or other form. formula : true construct: $$ = logic.Formula(conjunction=$1) @@ -684,14 +719,17 @@ formula deconstruct if builtin.has_proto_field($$, 'cast'): $1: logic.Cast = $$.cast +# The trivially true formula (empty conjunction). true : "(" "true" ")" construct: $$ = logic.Conjunction(args=list[logic.Formula]()) +# The trivially false formula (empty disjunction). false : "(" "false" ")" construct: $$ = logic.Disjunction(args=list[logic.Formula]()) +# Existential quantification: introduces locally scoped variables. exists : "(" "exists" bindings formula ")" construct: $$ = logic.Exists(body=logic.Abstraction(vars=builtin.list_concat($3[0], $3[1]), value=$4)) @@ -699,6 +737,8 @@ exists $3: Tuple[Sequence[logic.Binding], Sequence[logic.Binding]] = deconstruct_bindings($$.body) $4: logic.Formula = $$.body.value +# Aggregation: applies a binary operator (op) over the results of a body abstraction, +# with initial seed terms. reduce : "(" "reduce" abstraction abstraction terms ")" construct: $$ = logic.Reduce(op=$3, body=$4, terms=$5) @@ -707,6 +747,7 @@ reduce $4: logic.Abstraction = $$.body $5: Sequence[logic.Term] = $$.terms +# A term is either a variable or a constant value. term : var construct: $$ = logic.Term(var=$1) @@ -717,26 +758,31 @@ term deconstruct if builtin.has_proto_field($$, 'constant'): $1: logic.Value = $$.constant +# A variable reference. var : SYMBOL construct: $$ = logic.Var(name=$1) deconstruct: $1: String = $$.name +# Logical AND of zero or more formulas. conjunction : "(" "and" formula* ")" construct: $$ = logic.Conjunction(args=$3) deconstruct: $3: Sequence[logic.Formula] = $$.args +# Logical OR of zero or more formulas. disjunction : "(" "or" formula* ")" construct: $$ = logic.Disjunction(args=$3) deconstruct: $3: Sequence[logic.Formula] = $$.args +# Logical negation of a formula. 
not : "(" "not" formula ")" construct: $$ = logic.Not(arg=$3) deconstruct: $3: logic.Formula = $$.arg +# A foreign function interface call with a name, abstraction arguments, and terms. ffi : "(" "ffi" name ffi_args terms ")" construct: $$ = logic.FFI(name=$3, args=$4, terms=$5) @@ -745,15 +791,19 @@ ffi $4: Sequence[logic.Abstraction] = $$.args $5: Sequence[logic.Term] = $$.terms +# The argument abstractions of an FFI call. ffi_args : "(" "args" abstraction* ")" +# A parenthesized list of terms. terms : "(" "terms" term* ")" +# A colon-prefixed symbolic name. name : ":" SYMBOL +# A relational atom: applies a named relation to a list of terms. atom : "(" "atom" relation_id term* ")" construct: $$ = logic.Atom(name=$3, terms=$4) @@ -761,6 +811,7 @@ atom $3: logic.RelationId = $$.name $4: Sequence[logic.Term] = $$.terms +# A compiler pragma: a named directive with term arguments. pragma : "(" "pragma" name term* ")" construct: $$ = logic.Pragma(name=$3, terms=$4) @@ -768,6 +819,8 @@ pragma $3: String = $$.name $4: Sequence[logic.Term] = $$.terms +# A built-in primitive operation. Includes syntactic sugar for common comparisons +# and arithmetic (`=`, `<`, `+`, etc.) as well as a generic named form. primitive : eq | lt @@ -851,6 +904,7 @@ divide $4: logic.Term = $$.terms[1].term $5: logic.Term = $$.terms[2].term +# A relational term: either a regular term or a specialized (hash-prefixed) constant value. rel_term : specialized_value construct: $$ = logic.RelTerm(specialized_value=$1) @@ -861,9 +915,11 @@ rel_term deconstruct if builtin.has_proto_field($$, 'term'): $1: logic.Term = $$.term +# A hash-prefixed constant value used for type specialization in primitives and rel_atoms. specialized_value : "#" raw_value +# A relational atom with support for specialized value terms. 
rel_atom : "(" "relatom" name rel_term* ")" construct: $$ = logic.RelAtom(name=$3, terms=$4) @@ -871,6 +927,7 @@ rel_atom $3: String = $$.name $4: Sequence[logic.RelTerm] = $$.terms +# A type cast from an input term to a result term. cast : "(" "cast" term term ")" construct: $$ = logic.Cast(input=$3, result=$4) @@ -878,9 +935,11 @@ cast $3: logic.Term = $$.input $4: logic.Term = $$.result +# A list of attributes attached to a def, algorithm, or instruction. attrs : "(" "attrs" attribute* ")" +# A single named attribute with zero or more value arguments. attribute : "(" "attribute" name raw_value* ")" construct: $$ = logic.Attribute(name=$3, args=$4) @@ -888,6 +947,8 @@ attribute $3: String = $$.name $4: Sequence[logic.Value] = $$.args +# An imperative algorithm declaration with global relation references, a script body, +# and optional attributes. algorithm : "(" "algorithm" relation_id* script attrs? ")" construct: $$ = logic.Algorithm(global=$3, body=$4, attrs=builtin.unwrap_option_or($5, list[logic.Attribute]())) @@ -896,11 +957,13 @@ algorithm $4: logic.Script = $$.body $5: Optional[Sequence[logic.Attribute]] = $$.attrs if not builtin.is_empty($$.attrs) else None +# The body of an algorithm: a sequence of constructs (loops and instructions). script : "(" "script" construct* ")" construct: $$ = logic.Script(constructs=$3) deconstruct: $3: Sequence[logic.Construct] = $$.constructs +# A single construct within a script: either a loop or an instruction. construct : loop construct: $$ = logic.Construct(loop=$1) @@ -911,6 +974,7 @@ construct deconstruct if builtin.has_proto_field($$, 'instruction'): $1: logic.Instruction = $$.instruction +# A loop construct with initialization instructions and a script body. loop : "(" "loop" init script attrs? 
")" construct: $$ = logic.Loop(init=$3, body=$4, attrs=builtin.unwrap_option_or($5, list[logic.Attribute]())) @@ -919,9 +983,11 @@ loop $4: logic.Script = $$.body $5: Optional[Sequence[logic.Attribute]] = $$.attrs if not builtin.is_empty($$.attrs) else None +# The initialization block of a loop. init : "(" "init" instruction* ")" +# A single imperative instruction within a script or loop. instruction : assign construct: $$ = logic.Instruction(assign=$1) @@ -944,6 +1010,7 @@ instruction deconstruct if builtin.has_proto_field($$, 'monus_def'): $1: logic.MonusDef = $$.monus_def +# Assigns a relation to the result of an abstraction (replaces existing tuples). assign : "(" "assign" relation_id abstraction attrs? ")" construct: $$ = logic.Assign(name=$3, body=$4, attrs=builtin.unwrap_option_or($5, list[logic.Attribute]())) @@ -952,6 +1019,7 @@ assign $4: logic.Abstraction = $$.body $5: Optional[Sequence[logic.Attribute]] = $$.attrs if not builtin.is_empty($$.attrs) else None +# Merges tuples into a relation using a monoid-based update (insert or update existing). upsert : "(" "upsert" relation_id abstraction_with_arity attrs? ")" construct: $$ = logic.Upsert(name=$3, body=$4[0], attrs=builtin.unwrap_option_or($5, list[logic.Attribute]()), value_arity=$4[1]) @@ -960,6 +1028,8 @@ upsert $4: Tuple[logic.Abstraction, Int64] = builtin.tuple($$.body, $$.value_arity) $5: Optional[Sequence[logic.Attribute]] = $$.attrs if not builtin.is_empty($$.attrs) else None +# An abstraction that distinguishes key bindings from value bindings via the `|` separator. +# The value arity is derived from the number of value bindings. 
abstraction_with_arity : "(" bindings formula ")" construct: $$ = builtin.tuple(logic.Abstraction(vars=builtin.list_concat($2[0], $2[1]), value=$3), builtin.length($2[1])) @@ -967,6 +1037,7 @@ abstraction_with_arity $2: Tuple[Sequence[logic.Binding], Sequence[logic.Binding]] = deconstruct_bindings_with_arity($$[0], $$[1]) $3: logic.Formula = $$[0].value +# A loop termination condition: breaks when the relation matches the abstraction. break : "(" "break" relation_id abstraction attrs? ")" construct: $$ = logic.Break(name=$3, body=$4, attrs=builtin.unwrap_option_or($5, list[logic.Attribute]())) @@ -975,6 +1046,7 @@ break $4: logic.Abstraction = $$.body $5: Optional[Sequence[logic.Attribute]] = $$.attrs if not builtin.is_empty($$.attrs) else None +# Defines an aggregation over a relation using a monoid (or, min, max, sum). monoid_def : "(" "monoid" monoid relation_id abstraction_with_arity attrs? ")" construct: $$ = logic.MonoidDef(monoid=$3, name=$4, body=$5[0], attrs=builtin.unwrap_option_or($6, list[logic.Attribute]()), value_arity=$5[1]) @@ -984,6 +1056,7 @@ monoid_def $5: Tuple[logic.Abstraction, Int64] = builtin.tuple($$.body, $$.value_arity) $6: Optional[Sequence[logic.Attribute]] = $$.attrs if not builtin.is_empty($$.attrs) else None +# The type of aggregation monoid. monoid : or_monoid construct: $$ = logic.Monoid(or_monoid=$1) @@ -1021,6 +1094,7 @@ sum_monoid construct: $$ = logic.SumMonoid(type=$3) deconstruct: $3: logic.Type = $$.type +# Defines a monus (subtraction) operation over a relation using a monoid. monus_def : "(" "monus" monoid relation_id abstraction_with_arity attrs? 
")" construct: $$ = logic.MonusDef(monoid=$3, name=$4, body=$5[0], attrs=builtin.unwrap_option_or($6, list[logic.Attribute]()), value_arity=$5[1]) @@ -1030,6 +1104,8 @@ monus_def $5: Tuple[logic.Abstraction, Int64] = builtin.tuple($$.body, $$.value_arity) $6: Optional[Sequence[logic.Attribute]] = $$.attrs if not builtin.is_empty($$.attrs) else None +# A functional dependency constraint on a relation: given the key variables, +# the value variables are uniquely determined. constraint : "(" "functional_dependency" relation_id abstraction functional_dependency_keys functional_dependency_values ")" construct: $$ = logic.Constraint(name=$3, functional_dependency=logic.FunctionalDependency(guard=$4, keys=$5, values=$6)) @@ -1045,6 +1121,7 @@ functional_dependency_keys functional_dependency_values : "(" "values" var* ")" +# A data source declaration: external data (EDB), B-tree, CSV, or Iceberg. data : edb construct: $$ = logic.Data(edb=$1) @@ -1069,6 +1146,8 @@ edb_path edb_types : "[" type* "]" +# An extensional database (EDB) declaration: maps a relation to stored data at a given +# path with specified column types. edb : "(" "edb" relation_id edb_path edb_types ")" construct: $$ = logic.EDB(target_id=$3, path=$4, types=$5) @@ -1077,6 +1156,7 @@ edb $4: Sequence[String] = $$.path $5: Sequence[logic.Type] = $$.types +# A B-epsilon-tree backed relation with storage configuration and locator info. betree_relation : "(" "betree_relation" relation_id betree_info ")" construct: $$ = logic.BeTreeRelation(name=$3, relation_info=$4) @@ -1084,6 +1164,7 @@ betree_relation $3: logic.RelationId = $$.name $4: logic.BeTreeInfo = $$.relation_info +# Storage metadata for a B-tree relation: key/value types and configuration parameters. 
betree_info : "(" "betree_info" betree_info_key_types betree_info_value_types config_dict ")" construct: $$ = construct_betree_info($3, $4, $5) @@ -1098,12 +1179,15 @@ betree_info_key_types betree_info_value_types : "(" "value_types" type* ")" +# A list of GNF (Graph Normal Form) column definitions. gnf_columns : "(" "columns" gnf_column* ")" +# A timestamp indicating the point-in-time for the CSV data snapshot. csv_asof : "(" "asof" STRING ")" +# A CSV data source with locator, configuration, column definitions, and a snapshot timestamp. csv_data : "(" "csv_data" csvlocator csv_config gnf_columns csv_asof ")" construct: $$ = logic.CSVData(locator=$3, config=$4, columns=$5, asof=$6) @@ -1119,6 +1203,7 @@ csv_locator_paths csv_locator_inline_data : "(" "inline_data" STRING ")" +# Locates CSV data by file paths, inline data, or both; each part is optional. csvlocator : "(" "csv_locator" csv_locator_paths? csv_locator_inline_data? ")" construct: $$ = logic.CSVLocator(paths=builtin.unwrap_option_or($3, list[str]()), inline_data=builtin.encode_string(builtin.unwrap_option_or($4, ""))) @@ -1126,11 +1211,13 @@ csvlocator $3: Optional[Sequence[String]] = $$.paths if not builtin.is_empty($$.paths) else None $4: Optional[String] = builtin.decode_string($$.inline_data) if builtin.decode_string($$.inline_data) != "" else None +# CSV parsing configuration (delimiter, quotechar, escapechar, encoding, etc.). csv_config : "(" "csv_config" config_dict ")" construct: $$ = construct_csv_config($3) deconstruct: $3: Sequence[Tuple[String, logic.Value]] = deconstruct_csv_config($$) +# The path identifying a column: a single string or a bracketed list of strings for nested paths. gnf_column_path : STRING construct: $$ = [$1] @@ -1141,6 +1228,7 @@ gnf_column_path deconstruct if builtin.length($$) != 1: $2: Sequence[String] = $$ +# A single GNF column definition with a path, optional target relation, and column types. gnf_column : "(" "column" gnf_column_path relation_id?
"[" type* "]" ")" construct: $$ = logic.GNFColumn(column_path=$3, target_id=$4, types=$6) @@ -1183,6 +1271,7 @@ iceberg_locator_warehouse construct: $$ = $3 deconstruct: $3: String = $$ +# Identifies an Iceberg table by its name, namespace, and warehouse. iceberg_locator : "(" "iceberg_locator" iceberg_locator_table_name iceberg_locator_namespace iceberg_locator_warehouse ")" construct: $$ = logic.IcebergLocator(table_name=$3, namespace=$4, warehouse=$5) @@ -1211,6 +1300,7 @@ iceberg_auth_properties construct: $$ = $3 deconstruct: $3: Sequence[Tuple[String, String]] = $$ +# Configuration for an Iceberg catalog: URI, optional scope, properties, and auth properties. iceberg_catalog_config : "(" "iceberg_catalog_config" iceberg_catalog_uri iceberg_catalog_config_scope? iceberg_properties iceberg_auth_properties ")" construct: $$ = construct_iceberg_catalog_config($3, $4, $5, $6) @@ -1225,6 +1315,8 @@ iceberg_to_snapshot construct: $$ = $3 deconstruct: $3: String = $$ +# An Iceberg data source with locator, catalog config, columns, optional snapshot range, +# and a flag indicating whether it returns delta data. iceberg_data : "(" "iceberg_data" iceberg_locator iceberg_catalog_config gnf_columns iceberg_from_snapshot? iceberg_to_snapshot? boolean_value ")" construct: $$ = construct_iceberg_data($3, $4, $5, $6, $7, $8) @@ -1236,16 +1328,19 @@ iceberg_data $7: Optional[String] = deconstruct_iceberg_data_to_snapshot_optional($$) $8: Boolean = $$.returns_delta +# Removes a previously defined fragment from the database. undefine : "(" "undefine" fragment_id ")" construct: $$ = transactions.Undefine(fragment_id=$3) deconstruct: $3: fragments.FragmentId = $$.fragment_id +# Declares the context relations that are visible for the current epoch. context : "(" "context" relation_id* ")" construct: $$ = transactions.Context(relations=$3) deconstruct: $3: Sequence[logic.RelationId] = $$.relations +# Maps a destination EDB path to a source relation for snapshotting. 
snapshot_mapping : edb_path relation_id construct: $$ = transactions.SnapshotMapping(destination_path=$1, source_relation=$2) @@ -1253,6 +1348,7 @@ snapshot_mapping $1: Sequence[String] = $$.destination_path $2: logic.RelationId = $$.source_relation +# Snapshots relations into EDB storage at a given prefix path. snapshot : "(" "snapshot" edb_path snapshot_mapping* ")" construct: $$ = transactions.Snapshot(prefix=$3, mappings=$4) @@ -1260,9 +1356,11 @@ snapshot $3: Sequence[String] = $$.prefix $4: Sequence[transactions.SnapshotMapping] = $$.mappings +# The read section of an epoch, containing zero or more read operations. epoch_reads : "(" "reads" read* ")" +# A single read operation: demand, output, what-if, abort, or export. read : demand construct: $$ = transactions.Read(demand=$1) @@ -1285,11 +1383,13 @@ read deconstruct if builtin.has_proto_field($$, 'export'): $1: transactions.Export = $$.export +# Requests evaluation of a relation without naming the output. demand : "(" "demand" relation_id ")" construct: $$ = transactions.Demand(relation_id=$3) deconstruct: $3: logic.RelationId = $$.relation_id +# Requests evaluation of a relation and assigns a name to the output. output : "(" "output" name relation_id ")" construct: $$ = transactions.Output(name=$3, relation_id=$4) @@ -1297,6 +1397,7 @@ output $3: String = $$.name $4: logic.RelationId = $$.relation_id +# A hypothetical branch: evaluates an epoch in a named sandbox without committing writes. what_if : "(" "what_if" name epoch ")" construct: $$ = transactions.WhatIf(branch=$3, epoch=$4) @@ -1304,6 +1405,7 @@ what_if $3: String = $$.branch $4: transactions.Epoch = $$.epoch +# Aborts the transaction if the given relation is non-empty, with an optional name. abort : "(" "abort" name? 
relation_id ")" construct: $$ = transactions.Abort(name=builtin.unwrap_option_or($3, "abort"), relation_id=$4) @@ -1311,6 +1413,7 @@ abort $3: Optional[String] = $$.name if $$.name != "abort" else None $4: logic.RelationId = $$.relation_id +# Exports data to an external format (CSV or Iceberg). export : "(" "export" export_csv_config ")" construct: $$ = transactions.Export(csv_config=$3) @@ -1321,6 +1424,8 @@ export deconstruct if builtin.has_proto_field($$, 'iceberg_config'): $3: transactions.ExportIcebergConfig = $$.iceberg_config +# Configuration for CSV export. The v2 variant uses a source specification and a csv_config; +# the legacy variant uses an explicit column list and a config dict. export_csv_config : "(" "export_csv_config_v2" export_csv_path export_csv_source csv_config ")" construct: $$ = construct_export_csv_config_with_source($3, $4, $5) @@ -1348,6 +1453,7 @@ export_csv_column $3: String = $$.column_name $4: logic.RelationId = $$.column_data +# The data source for a v2 CSV export: either explicit GNF columns or a table definition. export_csv_source : "(" "gnf_columns" export_csv_column* ")" construct: $$ = transactions.ExportCSVSource(gnf_columns=transactions.ExportCSVColumns(columns=$3)) @@ -1368,6 +1474,8 @@ iceberg_table_properties construct: $$ = $3 deconstruct: $3: Sequence[Tuple[String, String]] = $$ +# Configuration for Iceberg export: locator, catalog config, table definition, +# table properties, and optional additional config. export_iceberg_config : "(" "export_iceberg_config" iceberg_locator iceberg_catalog_config export_iceberg_table_def iceberg_table_properties config_dict? ")" construct: $$ = construct_export_iceberg_config_full($3, $4, $5, $6, $7)