diff --git a/binoc-core/src/config.rs b/binoc-core/src/config.rs index ee6ccf6..43919f8 100644 --- a/binoc-core/src/config.rs +++ b/binoc-core/src/config.rs @@ -111,6 +111,7 @@ impl DatasetConfig { "binoc.folder_move_detector".into(), "binoc.table_splitter".into(), "binoc.tabular_analyzer".into(), + "binoc.tabular_stats_annotator".into(), "binoc.column_reorder_detector".into(), "binoc.table_collection_analyzer".into(), ], diff --git a/binoc-core/src/controller.rs b/binoc-core/src/controller.rs index 7720e1b..97ace41 100644 --- a/binoc-core/src/controller.rs +++ b/binoc-core/src/controller.rs @@ -573,8 +573,8 @@ impl Controller { /// re-dispatch the pair through the comparator pipeline, then merge the /// resulting `item_type`, `comparator`, `source_items`, `artifacts`, /// `details`, and `children` into the host node. The comparator's own - /// summary (e.g. "2 lines added") is stashed in - /// `annotations.content_summary` so renderers can surface it without + /// summary (e.g. "2 lines added") is stashed in the `content_summary` + /// annotation so renderers can surface it without /// overwriting the host's move headline. 
/// /// `pending_recompare` is `take()`n (cleared) on every visited node, @@ -642,9 +642,9 @@ impl Controller { if node.summary.is_none() { node.summary = Some(summary.clone()); } - node.annotations - .entry("content_summary".into()) - .or_insert_with(|| serde_json::json!(summary)); + if node.binoc_annotation("content_summary").is_none() { + node.annotate_from("binoc", "content_summary", serde_json::json!(summary)); + } } } // Splice point: future non-Root transformers that need same-pass diff --git a/binoc-python/python/binoc/_binoc.pyi b/binoc-python/python/binoc/_binoc.pyi index 3627bbf..a0d3a27 100644 --- a/binoc-python/python/binoc/_binoc.pyi +++ b/binoc-python/python/binoc/_binoc.pyi @@ -2,6 +2,8 @@ from __future__ import annotations from typing import Any, Iterator +AnnotationRecord = dict[str, Any] + class DiffNode: def __init__( self, @@ -13,7 +15,7 @@ class DiffNode: summary: str | None = None, tags: list[str] | set[str] | None = None, details: dict[str, Any] | None = None, - annotations: dict[str, Any] | None = None, + annotations: list[AnnotationRecord] | None = None, children: list[DiffNode] | None = None, ) -> None: ... @property @@ -33,7 +35,7 @@ class DiffNode: @property def details(self) -> dict[str, Any]: ... @property - def annotations(self) -> dict[str, Any]: ... + def annotations(self) -> list[AnnotationRecord]: ... def node_count(self) -> int: ... def all_tags(self) -> list[str]: ... def to_dict(self) -> dict[str, Any]: ... @@ -43,6 +45,8 @@ class DiffNode: def with_source_path(self, source: str) -> DiffNode: ... def with_children(self, children: list[DiffNode]) -> DiffNode: ... def with_detail(self, key: str, value: Any) -> DiffNode: ... + def with_annotation_from(self, package: str, key: str, value: Any) -> DiffNode: ... + def annotate_from(self, package: str, key: str, value: Any) -> DiffNode: ... def find_node(self, selector: str) -> DiffNode | None: ... def __repr__(self) -> str: ... def __str__(self) -> str: ... 
diff --git a/binoc-python/src/lib.rs b/binoc-python/src/lib.rs index 6d03d48..2e17759 100644 --- a/binoc-python/src/lib.rs +++ b/binoc-python/src/lib.rs @@ -502,6 +502,58 @@ fn py_dict_to_json_map(dict: &Bound<'_, PyDict>) -> PyResult(py: Python<'py>, annotation: &Annotation) -> PyResult> { + let dict = PyDict::new(py); + dict.set_item("package", &annotation.package)?; + dict.set_item("key", &annotation.key)?; + dict.set_item("value", json_to_py(py, &annotation.value)?)?; + Ok(dict) +} + +fn annotations_to_py<'py>( + py: Python<'py>, + annotations: &[Annotation], +) -> PyResult> { + let items: PyResult>> = annotations + .iter() + .map(|annotation| annotation_to_py(py, annotation)) + .collect(); + PyList::new(py, items?) +} + +fn py_annotation_record_to_ir(dict: &Bound<'_, PyDict>) -> PyResult { + let package = dict + .get_item("package")? + .map(|value| value.extract::()) + .transpose()? + .unwrap_or_else(|| "binoc".to_string()); + let key = dict + .get_item("key")? + .ok_or_else(|| PyTypeError::new_err("annotation record missing 'key'"))? + .extract::()?; + let value = dict + .get_item("value")? 
+ .ok_or_else(|| PyTypeError::new_err("annotation record missing 'value'")) + .and_then(|value| py_to_json(&value))?; + Ok(Annotation::new(package, key, value)) +} + +fn py_annotations_to_ir(obj: &Bound<'_, PyAny>) -> PyResult> { + if let Ok(list) = obj.cast::() { + let mut annotations = Vec::new(); + for item in list.iter() { + let dict = item + .cast::() + .map_err(|_| PyTypeError::new_err("annotation list items must be dicts"))?; + annotations.push(py_annotation_record_to_ir(dict)?); + } + return Ok(annotations); + } + Err(PyTypeError::new_err( + "annotations must be a list of annotation dicts", + )) +} + // ═══════════════════════════════════════════════════════════════════════════ // PyDiffNode // ═══════════════════════════════════════════════════════════════════════════ @@ -540,7 +592,8 @@ impl PyDiffNode { /// :param tags: Optional list or set of open-string tags (used for /// renderer significance classification and transformer dispatch). /// :param details: Optional dict of structured JSON-serializable data. - /// :param annotations: Optional dict of transient/presentation data. + /// :param annotations: Optional list of annotation dicts with explicit + /// ``package``, ``key``, and ``value`` fields. /// :param children: Optional list of child ``DiffNode`` s. 
#[new] #[pyo3(signature = (action, item_type, path, *, source_path=None, summary=None, tags=None, details=None, annotations=None, children=None))] @@ -553,7 +606,7 @@ impl PyDiffNode { summary: Option, tags: Option>, details: Option>, - annotations: Option>, + annotations: Option>, children: Option>, ) -> PyResult { let mut node = DiffNode::new(action, item_type, path); @@ -576,7 +629,7 @@ impl PyDiffNode { node.details = py_dict_to_json_map(&d)?; } if let Some(a) = annotations { - node.annotations = py_dict_to_json_map(&a)?; + node.annotations = py_annotations_to_ir(&a)?; } if let Some(c) = children { node.children = c.into_iter().map(|n| n.inner).collect(); @@ -629,10 +682,11 @@ impl PyDiffNode { fn details<'py>(&self, py: Python<'py>) -> PyResult> { json_map_to_py(py, &self.inner.details) } - /// Transient/presentation annotations not part of the persisted IR. + /// Renderer-visible annotations as ``{"package", "key", "value"}`` + /// records. #[getter] - fn annotations<'py>(&self, py: Python<'py>) -> PyResult> { - json_map_to_py(py, &self.inner.annotations) + fn annotations<'py>(&self, py: Python<'py>) -> PyResult> { + annotations_to_py(py, &self.inner.annotations) } /// Total number of nodes in the subtree rooted at this node. @@ -661,7 +715,10 @@ impl PyDiffNode { .collect(); dict.set_item("children", PyList::new(py, children?)?)?; dict.set_item("details", json_map_to_py(py, &self.inner.details)?)?; - dict.set_item("annotations", json_map_to_py(py, &self.inner.annotations)?)?; + dict.set_item( + "annotations", + annotations_to_py(py, &self.inner.annotations)?, + )?; Ok(dict) } @@ -704,6 +761,31 @@ impl PyDiffNode { inner: self.inner.clone().with_detail(key, json_val), }) } + /// Return a clone of this node with a namespaced annotation set. + /// ``value`` must be JSON-serializable. 
+ fn with_annotation_from( + &self, + package: String, + key: String, + value: Bound<'_, PyAny>, + ) -> PyResult { + let json_val = py_to_json(&value)?; + Ok(Self { + inner: self + .inner + .clone() + .with_annotation_from(package, key, json_val), + }) + } + /// Alias for ``with_annotation_from``. + fn annotate_from( + &self, + package: String, + key: String, + value: Bound<'_, PyAny>, + ) -> PyResult { + self.with_annotation_from(package, key, value) + } /// Recursively search this subtree for a node whose ``path`` matches /// ``selector``. Returns ``None`` if no match is found. fn find_node(&self, selector: &str) -> Option { diff --git a/binoc-python/tests/test_ir.py b/binoc-python/tests/test_ir.py index 9045825..b053f10 100644 --- a/binoc-python/tests/test_ir.py +++ b/binoc-python/tests/test_ir.py @@ -56,6 +56,43 @@ def test_with_detail(self): updated = node.with_detail('key', 'value') assert updated.details['key'] == 'value' + def test_annotations_are_namespaced_records(self): + node = binoc.DiffNode( + 'modify', + 'file', + 'f', + annotations=[{'package': 'binoc', 'key': 'note', 'value': 'check me'}], + ) + assert node.annotations == [{'package': 'binoc', 'key': 'note', 'value': 'check me'}] + + updated = node.annotate_from('binoc', 'distribution_shifts', ['score changed']) + updated = updated.annotate_from('example.plugin', 'warning', {'level': 2}) + assert updated.annotations == [ + {'package': 'binoc', 'key': 'note', 'value': 'check me'}, + { + 'package': 'binoc', + 'key': 'distribution_shifts', + 'value': ['score changed'], + }, + { + 'package': 'example.plugin', + 'key': 'warning', + 'value': {'level': 2}, + }, + ] + assert node.annotations == [{'package': 'binoc', 'key': 'note', 'value': 'check me'}] + + def test_annotation_records_can_be_constructed_explicitly(self): + node = binoc.DiffNode( + 'modify', + 'file', + 'f', + annotations=[{'package': 'example.plugin', 'key': 'note', 'value': ['one', 'two']}], + ) + assert node.to_dict()['annotations'] == [ 
+ {'package': 'example.plugin', 'key': 'note', 'value': ['one', 'two']} + ] + def test_with_source_path(self): node = binoc.DiffNode('move', 'file', 'new.txt') moved = node.with_source_path('old.txt') diff --git a/binoc-sdk/src/ir.rs b/binoc-sdk/src/ir.rs index e291702..9054443 100644 --- a/binoc-sdk/src/ir.rs +++ b/binoc-sdk/src/ir.rs @@ -61,6 +61,39 @@ impl Diagnostic { } } +/// Renderer-visible metadata attached to a diff node by a comparator or +/// transformer. +/// +/// Annotations are intentionally progressively typed: producers can start with +/// a string or simple JSON value, and renderers can either display the generic +/// value shape or add package/key-specific handling later. The package namespace +/// keeps independently-authored plugins from colliding on common keys. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +pub struct Annotation { + pub package: String, + pub key: String, + pub value: serde_json::Value, +} + +impl Annotation { + pub fn new( + package: impl Into, + key: impl Into, + value: serde_json::Value, + ) -> Self { + Self { + package: package.into(), + key: key.into(), + value, + } + } + + pub fn as_str(&self) -> Option<&str> { + self.value.as_str() + } +} + /// A node in the diff tree — the central data structure of the system. /// Every comparator emits it, every transformer rewrites it, and serializers /// or bindings read it. @@ -107,9 +140,9 @@ pub struct DiffNode { #[serde(default, skip_serializing_if = "Vec::is_empty")] pub detail_blocks: Vec, - /// Transformer-added metadata. - #[serde(default, skip_serializing_if = "BTreeMap::is_empty")] - pub annotations: BTreeMap, + /// Renderer-visible annotations supplied by comparators or transformers. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub annotations: Vec, /// Which comparator produced this node (provenance for extract chain). 
#[serde(default, skip_serializing_if = "Option::is_none")] @@ -175,7 +208,7 @@ impl DiffNode { children: Vec::new(), details: BTreeMap::new(), detail_blocks: Vec::new(), - annotations: BTreeMap::new(), + annotations: Vec::new(), comparator: None, transformed_by: Vec::new(), source_items: None, @@ -210,6 +243,16 @@ impl DiffNode { self } + pub fn with_annotation_from( + mut self, + package: impl Into, + key: impl Into, + value: serde_json::Value, + ) -> Self { + self.annotate_from(package, key, value); + self + } + pub fn with_source_path(mut self, source: impl Into) -> Self { self.source_path = Some(source.into()); self @@ -239,6 +282,35 @@ impl DiffNode { self.diagnostics.push(diagnostic.normalized()); } + pub fn annotate_from( + &mut self, + package: impl Into, + key: impl Into, + value: serde_json::Value, + ) { + let package = package.into(); + let key = key.into(); + if let Some(existing) = self + .annotations + .iter_mut() + .find(|annotation| annotation.package == package && annotation.key == key) + { + existing.value = value; + } else { + self.annotations.push(Annotation::new(package, key, value)); + } + } + + pub fn annotation(&self, package: &str, key: &str) -> Option<&Annotation> { + self.annotations + .iter() + .find(|annotation| annotation.package == package && annotation.key == key) + } + + pub fn binoc_annotation(&self, key: &str) -> Option<&Annotation> { + self.annotation("binoc", key) + } + pub fn node_count(&self) -> usize { 1 + self.children.iter().map(|c| c.node_count()).sum::() } @@ -495,6 +567,7 @@ mod tests { .with_tag("binoc.column-reorder") .with_tag("binoc.whitespace") .with_detail("lines_changed", serde_json::json!(42)) + .with_annotation_from("binoc", "note", serde_json::json!("check distribution")) .with_children(vec![child]) .with_source_path("old/dir"); @@ -505,12 +578,37 @@ mod tests { node.details.get("lines_changed"), Some(&serde_json::json!(42)) ); + assert_eq!( + node.binoc_annotation("note") + .map(|annotation| 
&annotation.value), + Some(&serde_json::json!("check distribution")) + ); assert!(node.detail_blocks.is_empty()); assert_eq!(node.children.len(), 1); assert_eq!(node.children[0].path, "child.txt"); assert_eq!(node.source_path.as_deref(), Some("old/dir")); } + #[test] + fn annotations_are_namespaced_and_replace_by_package_key() { + let mut node = DiffNode::new("modify", "file", "data.csv"); + node.annotate_from("binoc", "note", serde_json::json!("first")); + node.annotate_from("binoc", "note", serde_json::json!("second")); + node.annotate_from("example.plugin", "note", serde_json::json!("external")); + + assert_eq!(node.annotations.len(), 2); + assert_eq!( + node.binoc_annotation("note") + .map(|annotation| &annotation.value), + Some(&serde_json::json!("second")) + ); + assert_eq!( + node.annotation("example.plugin", "note") + .map(|annotation| &annotation.value), + Some(&serde_json::json!("external")) + ); + } + #[test] fn node_count_leaf_returns_one() { let node = DiffNode::new("add", "file", "file.txt"); diff --git a/binoc-sdk/src/lib.rs b/binoc-sdk/src/lib.rs index 5c35aac..62ba542 100644 --- a/binoc-sdk/src/lib.rs +++ b/binoc-sdk/src/lib.rs @@ -9,8 +9,8 @@ pub mod test_support; pub use data_access::LocalDataAccess; pub use ir::{ - Changeset, DetailBlock, DetailExample, Diagnostic, DiagnosticSeverity, DiffNode, ExtractHint, - ValuePreview, + Annotation, Changeset, DetailBlock, DetailExample, Diagnostic, DiagnosticSeverity, DiffNode, + ExtractHint, ValuePreview, }; pub use traits::*; pub use types::*; diff --git a/binoc-stdlib/src/lib.rs b/binoc-stdlib/src/lib.rs index bb20583..95da53c 100644 --- a/binoc-stdlib/src/lib.rs +++ b/binoc-stdlib/src/lib.rs @@ -32,6 +32,9 @@ pub fn register_stdlib(registry: &mut PluginRegistry) { ))); r(registry.register_transformer(Arc::new(transformers::table_splitter::TableSplitter))); r(registry.register_transformer(Arc::new(transformers::tabular_analyzer::TabularAnalyzer))); + r(registry.register_transformer(Arc::new( + 
transformers::tabular_stats_annotator::TabularStatsAnnotator, + ))); r(registry.register_transformer(Arc::new( transformers::column_reorder::ColumnReorderDetector, ))); diff --git a/binoc-stdlib/src/renderers/markdown.rs b/binoc-stdlib/src/renderers/markdown.rs index 3734a03..4b6b1e3 100644 --- a/binoc-stdlib/src/renderers/markdown.rs +++ b/binoc-stdlib/src/renderers/markdown.rs @@ -293,6 +293,7 @@ fn format_node( } } + render_annotations(out, node, config, detail_budget); render_detail_blocks(out, node, path, config, detail_budget); } @@ -331,13 +332,139 @@ fn move_trailer(node: &DiffNode) -> Option { } fn annotation_str(node: &DiffNode, key: &str) -> Option { - node.annotations - .get(key) - .and_then(|v| v.as_str()) + node.binoc_annotation(key) + .and_then(Annotation::as_str) .filter(|s| !s.is_empty()) .map(|s| s.to_string()) } +fn render_annotations( + out: &mut String, + node: &DiffNode, + config: &MarkdownRendererConfig, + detail_budget: &mut DetailBudget, +) { + if config.verbosity == Verbosity::Summary { + return; + } + for annotation in node.annotations.iter().filter(|annotation| { + !(annotation.package == "binoc" + && matches!( + annotation.key.as_str(), + "content_summary" | "tabular_summary" + )) + }) { + if !render_annotation(out, annotation, config, detail_budget) { + return; + } + } +} + +fn render_annotation( + out: &mut String, + annotation: &Annotation, + config: &MarkdownRendererConfig, + detail_budget: &mut DetailBudget, +) -> bool { + let label = annotation_label(annotation); + match &annotation.value { + serde_json::Value::Null => detail_budget.push_line(out, format!(" - {label}: null\n")), + serde_json::Value::Bool(value) => { + detail_budget.push_line(out, format!(" - {label}: {value}\n")) + } + serde_json::Value::Number(value) => { + detail_budget.push_line(out, format!(" - {label}: {value}\n")) + } + serde_json::Value::String(value) => { + let (value, truncated) = truncate_text(value, config.max_value_chars); + detail_budget.push_line( + 
out, + format!( + " - {label}: {}{}\n", + value, + if truncated { "..." } else { "" } + ), + ) + } + serde_json::Value::Array(values) if values.iter().all(|value| value.as_str().is_some()) => { + render_string_list_annotation(out, &label, values, config, detail_budget) + } + value => detail_budget.push_line( + out, + format!( + " - {label}: {}\n", + format_json_annotation_value(value, config) + ), + ), + } +} + +fn render_string_list_annotation( + out: &mut String, + label: &str, + values: &[serde_json::Value], + config: &MarkdownRendererConfig, + detail_budget: &mut DetailBudget, +) -> bool { + let shown = if config.verbosity == Verbosity::Full { + values.len() + } else { + values.len().min(config.max_examples_per_block) + }; + if shown == 0 { + return true; + } + + let mut header = label.to_string(); + if shown < values.len() { + header.push_str(&format!(" (showing {shown} of {})", values.len())); + } + if !detail_budget.push_line(out, format!(" - {header}\n")) { + return false; + } + + for value in values.iter().take(shown).filter_map(|value| value.as_str()) { + let (value, truncated) = truncate_text(value, config.max_value_chars); + if !detail_budget.push_line( + out, + format!(" - {}{}\n", value, if truncated { "..." 
} else { "" }), + ) { + return false; + } + } + true +} + +fn format_json_annotation_value( + value: &serde_json::Value, + config: &MarkdownRendererConfig, +) -> String { + let raw = serde_json::to_string(value).unwrap_or_else(|_| "null".into()); + let (value, truncated) = truncate_text(&raw, config.max_value_chars); + if truncated { + format!("{value}...") + } else { + value + } +} + +fn annotation_label(annotation: &Annotation) -> String { + let mut label = humanize_annotation_key(&annotation.key); + if annotation.package != "binoc" { + label = format!("{} {label}", annotation.package); + } + label +} + +fn humanize_annotation_key(key: &str) -> String { + let text = key + .split(['_', '-']) + .filter(|part| !part.is_empty()) + .collect::>() + .join(" "); + capitalize(&text) +} + fn fallback_description(node: &DiffNode) -> String { let action = &node.action; let item_type = if node.item_type.is_empty() { @@ -840,8 +967,9 @@ mod tests { .with_tag("binoc.move.modified") .with_tag("binoc.column-addition") .with_tag("binoc.schema-change"); - move_node.annotations.insert( - "tabular_summary".into(), + move_node.annotate_from( + "binoc", + "tabular_summary", serde_json::json!("Column added: 'email'"), ); let root = DiffNode::new("modify", "directory", "").with_children(vec![move_node]); @@ -880,9 +1008,11 @@ mod tests { .with_tag("binoc.move.modified") .with_tag("binoc.content-changed") .with_tag("binoc.lines-added"); - move_node - .annotations - .insert("content_summary".into(), serde_json::json!("2 lines added")); + move_node.annotate_from( + "binoc", + "content_summary", + serde_json::json!("2 lines added"), + ); let root = DiffNode::new("modify", "directory", "").with_children(vec![move_node]); let md = render_markdown( @@ -906,13 +1036,16 @@ mod tests { .with_source_path("data.csv") .with_summary("Moved from data.csv (modified)") .with_tag("binoc.move"); - move_node.annotations.insert( - "tabular_summary".into(), + move_node.annotate_from( + "binoc", + 
"tabular_summary", serde_json::json!("Column added: 'email'"), ); - move_node - .annotations - .insert("content_summary".into(), serde_json::json!("CSV modified")); + move_node.annotate_from( + "binoc", + "content_summary", + serde_json::json!("CSV modified"), + ); let root = DiffNode::new("modify", "directory", "").with_children(vec![move_node]); let md = render_markdown( @@ -1055,6 +1188,61 @@ mod tests { assert!(!md.contains("showing 1 of 4")); } + #[test] + fn examples_verbosity_renders_string_list_annotations() { + let mut node = + DiffNode::new("modify", "tabular", "data.csv").with_summary("3 rows modified"); + node.annotate_from( + "binoc", + "distribution_shifts", + serde_json::json!([ + "column 'score': mean 20 -> 35.5", + "column 'rank': mean 2 -> 3", + "column 'cost': mean 10 -> 12", + "column 'height': mean 70 -> 72" + ]), + ); + + let md = render_markdown( + &[Changeset::new( + "a", + "b", + Some(DiffNode::new("modify", "directory", "").with_children(vec![node])), + )], + &MarkdownRendererConfig::default(), + ); + + assert!(md.contains("Distribution shifts (showing 3 of 4)")); + assert!(md.contains("column 'score': mean 20 -> 35.5")); + assert!(!md.contains("column 'height': mean 70 -> 72")); + } + + #[test] + fn summary_verbosity_hides_annotations() { + let mut node = + DiffNode::new("modify", "tabular", "data.csv").with_summary("3 rows modified"); + node.annotate_from( + "binoc", + "distribution_shifts", + serde_json::json!(["column 'score' changed"]), + ); + + let md = render_markdown( + &[Changeset::new( + "a", + "b", + Some(DiffNode::new("modify", "directory", "").with_children(vec![node])), + )], + &MarkdownRendererConfig { + verbosity: Verbosity::Summary, + ..Default::default() + }, + ); + + assert!(!md.contains("Distribution shifts")); + assert!(!md.contains("column 'score'")); + } + fn sample_detail_block(total_count: u64) -> DetailBlock { let mut block = DetailBlock::new("cells_changed", "binoc.tabular.cell_changes.v1") .with_label("Changed 
cells") diff --git a/binoc-stdlib/src/test_vectors.rs b/binoc-stdlib/src/test_vectors.rs index a4c7663..f3b79f0 100644 --- a/binoc-stdlib/src/test_vectors.rs +++ b/binoc-stdlib/src/test_vectors.rs @@ -333,6 +333,7 @@ pub fn abi_wrapped_default_registry() -> ( wrap_transformer!(folder_move_detector::FolderMoveDetector); wrap_transformer!(table_splitter::TableSplitter); wrap_transformer!(tabular_analyzer::TabularAnalyzer); + wrap_transformer!(tabular_stats_annotator::TabularStatsAnnotator); wrap_transformer!(column_reorder::ColumnReorderDetector); wrap_transformer!(table_collection_analyzer::TableCollectionAnalyzer); diff --git a/binoc-stdlib/src/transformers/folder_move_detector.rs b/binoc-stdlib/src/transformers/folder_move_detector.rs index 86dd808..b05203c 100644 --- a/binoc-stdlib/src/transformers/folder_move_detector.rs +++ b/binoc-stdlib/src/transformers/folder_move_detector.rs @@ -248,8 +248,8 @@ fn has_modification_detail(node: &DiffNode) -> bool { node.tags.contains("binoc.move.modified") || node.tags.contains("binoc.copy.modified") || node.tags.contains("binoc.content-changed") - || node.annotations.contains_key("content_summary") - || node.annotations.contains_key("tabular_summary") + || node.binoc_annotation("content_summary").is_some() + || node.binoc_annotation("tabular_summary").is_some() || !node.children.is_empty() } @@ -367,13 +367,11 @@ fn normalize_rollup_remainder_leaf(node: DiffNode, rollup: &Rollup) -> DiffNode fn maybe_demote_move_like_remainder(mut node: DiffNode) -> DiffNode { if matches!(node.action.as_str(), "move" | "copy") { let detail = node - .annotations - .get("tabular_summary") - .and_then(|v| v.as_str()) + .binoc_annotation("tabular_summary") + .and_then(Annotation::as_str) .or_else(|| { - node.annotations - .get("content_summary") - .and_then(|v| v.as_str()) + node.binoc_annotation("content_summary") + .and_then(Annotation::as_str) }) .map(str::to_string); node.action = "modify".to_string(); diff --git 
a/binoc-stdlib/src/transformers/mod.rs b/binoc-stdlib/src/transformers/mod.rs index 29430b7..b7651ca 100644 --- a/binoc-stdlib/src/transformers/mod.rs +++ b/binoc-stdlib/src/transformers/mod.rs @@ -7,3 +7,4 @@ pub mod fuzzy_correlation_detector; pub mod table_collection_analyzer; pub mod table_splitter; pub mod tabular_analyzer; +pub mod tabular_stats_annotator; diff --git a/binoc-stdlib/src/transformers/tabular_analyzer.rs b/binoc-stdlib/src/transformers/tabular_analyzer.rs index 53ebed9..c64debd 100644 --- a/binoc-stdlib/src/transformers/tabular_analyzer.rs +++ b/binoc-stdlib/src/transformers/tabular_analyzer.rs @@ -278,8 +278,7 @@ fn transform_modify( // stash the tabular description as an annotation so renderers can // surface it if they want without overwriting "Moved from ...". if node.action == "move" { - node.annotations - .insert("tabular_summary".into(), serde_json::json!(tabular_desc)); + node.annotate_from("binoc", "tabular_summary", serde_json::json!(tabular_desc)); } else { if tabular_desc != "Table modified" || node.summary.is_none() { node.summary = Some(tabular_desc); @@ -437,8 +436,7 @@ fn transform_modify_keyed( ); if node.action == "move" { - node.annotations - .insert("tabular_summary".into(), serde_json::json!(tabular_desc)); + node.annotate_from("binoc", "tabular_summary", serde_json::json!(tabular_desc)); } else if tabular_desc != "Table modified" || node.summary.is_none() { node.summary = Some(tabular_desc); } diff --git a/binoc-stdlib/src/transformers/tabular_stats_annotator.rs b/binoc-stdlib/src/transformers/tabular_stats_annotator.rs new file mode 100644 index 0000000..4002d71 --- /dev/null +++ b/binoc-stdlib/src/transformers/tabular_stats_annotator.rs @@ -0,0 +1,560 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use binoc_sdk::*; +use serde::Deserialize; + +const DISTRIBUTION_ANNOTATION_KEY: &str = "distribution_shifts"; +const EPSILON: f64 = 1e-9; + +#[derive(Debug, Clone, Deserialize, Default)] +struct TabularStatsAnnotatorConfig 
{ + #[serde(default)] + enabled: bool, +} + +/// Opt-in statistical annotation for changed tabular numeric columns. +/// +/// This transformer consumes `tabular_v1` artifacts and publishes a structured +/// distribution-shift annotation for numeric columns. It prefers keyed row +/// matching when row identity has already been configured, but falls back to +/// positional pairing. +pub struct TabularStatsAnnotator; + +impl Transformer for TabularStatsAnnotator { + fn descriptor(&self) -> TransformerDescriptor { + TransformerDescriptor::new("binoc.tabular_stats_annotator") + .with_match_artifacts(vec![tabular_v1()]) + .with_match_actions(vec!["modify".into(), "move".into()]) + } + + fn transform( + &self, + mut node: DiffNode, + data: &dyn DataAccess, + config: &serde_json::Value, + ) -> TransformResult { + let config: TabularStatsAnnotatorConfig = + serde_json::from_value(config.clone()).unwrap_or_default(); + if !config.enabled { + return TransformResult::Unchanged; + } + + let Some(pair) = TabularDataPair::from_artifacts(&node, data) else { + return TransformResult::Unchanged; + }; + let (Some(left), Some(right)) = (&pair.left, &pair.right) else { + return TransformResult::Unchanged; + }; + + let pairing = Pairing::for_node(&node, left, right); + let candidate_columns = candidate_columns(left, right, &pairing); + if candidate_columns.is_empty() { + return TransformResult::Unchanged; + } + + let lines: Vec = candidate_columns + .into_iter() + .filter_map(|column| distribution_change_for_column(&column, left, right, &pairing)) + .map(|change| change.to_annotation_line()) + .collect(); + + if lines.is_empty() { + return TransformResult::Unchanged; + } + + node.annotate_from( + "binoc", + DISTRIBUTION_ANNOTATION_KEY, + serde_json::json!(lines), + ); + TransformResult::Replace(Box::new(node)) + } +} + +#[derive(Debug, Clone)] +struct Pairing { + mode: &'static str, + matched: Vec<(usize, usize)>, + row_set_changed: bool, +} + +impl Pairing { + fn for_node(node: 
&DiffNode, left: &TabularData, right: &TabularData) -> Self { + if let Some(key_columns) = row_identity_columns(node, left, right) { + return keyed_pairing(left, right, &key_columns); + } + positional_pairing(left, right) + } +} + +fn positional_pairing(left: &TabularData, right: &TabularData) -> Pairing { + let matched_len = left.rows.len().min(right.rows.len()); + Pairing { + mode: "position", + matched: (0..matched_len).map(|idx| (idx, idx)).collect(), + row_set_changed: left.rows.len() != right.rows.len(), + } +} + +fn keyed_pairing(left: &TabularData, right: &TabularData, key_columns: &[String]) -> Pairing { + let left_index = index_rows_by_key(left, key_columns); + let right_index = index_rows_by_key(right, key_columns); + let keys: BTreeSet> = left_index + .keys() + .chain(right_index.keys()) + .cloned() + .collect(); + + let mut matched = Vec::new(); + let mut row_set_changed = false; + + for key in keys { + let left_rows = left_index.get(&key).cloned().unwrap_or_default(); + let right_rows = right_index.get(&key).cloned().unwrap_or_default(); + match (left_rows.as_slice(), right_rows.as_slice()) { + ([left_idx], [right_idx]) => matched.push((*left_idx, *right_idx)), + _ => row_set_changed = true, + } + } + + Pairing { + mode: "keyed", + matched, + row_set_changed, + } +} + +fn row_identity_columns( + node: &DiffNode, + left: &TabularData, + right: &TabularData, +) -> Option> { + let columns: Vec = node + .details + .get("row_identity") + .and_then(|value| value.get("columns")) + .and_then(|value| serde_json::from_value(value.clone()).ok())?; + if columns.is_empty() { + return None; + } + if columns + .iter() + .all(|column| left.column_index(column).is_some() && right.column_index(column).is_some()) + { + Some(columns) + } else { + None + } +} + +fn index_rows_by_key( + table: &TabularData, + key_columns: &[String], +) -> BTreeMap, Vec> { + let indices: Vec = key_columns + .iter() + .filter_map(|column| table.column_index(column)) + .collect(); + let mut 
by_key: BTreeMap, Vec> = BTreeMap::new(); + + for (row_index, row) in table.rows.iter().enumerate() { + let mut key = Vec::with_capacity(indices.len()); + let mut has_null = false; + for index in &indices { + let value = row.get(*index).map(|value| value.trim()).unwrap_or(""); + if value.is_empty() { + has_null = true; + break; + } + key.push(value.to_string()); + } + if !has_null { + by_key.entry(key).or_default().push(row_index); + } + } + + by_key +} + +fn candidate_columns(left: &TabularData, right: &TabularData, pairing: &Pairing) -> Vec { + let left_set: BTreeSet<&str> = left.headers.iter().map(String::as_str).collect(); + let common_columns: Vec = right + .headers + .iter() + .filter(|column| left_set.contains(column.as_str())) + .cloned() + .collect(); + + if pairing.row_set_changed { + return common_columns; + } + + common_columns + .into_iter() + .filter(|column| { + let Some(left_index) = left.column_index(column) else { + return false; + }; + let Some(right_index) = right.column_index(column) else { + return false; + }; + pairing.matched.iter().any(|(left_row, right_row)| { + left.rows[*left_row] + .get(left_index) + .map(String::as_str) + .unwrap_or("") + != right.rows[*right_row] + .get(right_index) + .map(String::as_str) + .unwrap_or("") + }) + }) + .collect() +} + +#[derive(Debug, Clone)] +struct ColumnNumbers { + values: Vec, + null_count: u64, +} + +fn distribution_change_for_column( + column: &str, + left: &TabularData, + right: &TabularData, + pairing: &Pairing, +) -> Option { + let left_numbers = numeric_column(left, column)?; + let right_numbers = numeric_column(right, column)?; + if left_numbers.values.is_empty() || right_numbers.values.is_empty() { + return None; + } + + let left_stats = stats_for_numbers(&left_numbers); + let right_stats = stats_for_numbers(&right_numbers); + let delta = NumericDistributionDelta { + null_count: right_stats.null_count as i64 - left_stats.null_count as i64, + min: right_stats.min - left_stats.min, + max: 
right_stats.max - left_stats.max, + mean: right_stats.mean - left_stats.mean, + median: right_stats.median - left_stats.median, + q1: right_stats.q1 - left_stats.q1, + q3: right_stats.q3 - left_stats.q3, + }; + let paired = paired_magnitude(left, right, column, pairing); + + if !distribution_changed(&left_stats, &right_stats, paired.as_ref()) { + return None; + } + + Some(NumericColumnDistributionChange { + column: column.to_string(), + pairing_mode: pairing.mode.to_string(), + left: left_stats, + right: right_stats, + delta, + paired, + }) +} + +#[derive(Debug, Clone, PartialEq)] +struct NumericDistributionStats { + count: u64, + null_count: u64, + min: f64, + max: f64, + mean: f64, + median: f64, + q1: f64, + q3: f64, +} + +#[derive(Debug, Clone, PartialEq)] +struct NumericDistributionDelta { + null_count: i64, + min: f64, + max: f64, + mean: f64, + median: f64, + q1: f64, + q3: f64, +} + +#[derive(Debug, Clone, PartialEq)] +struct NumericPairMagnitude { + compared_rows: u64, + changed_rows: u64, + mean_absolute_delta: f64, + max_absolute_delta: f64, +} + +#[derive(Debug, Clone, PartialEq)] +struct NumericColumnDistributionChange { + column: String, + pairing_mode: String, + left: NumericDistributionStats, + right: NumericDistributionStats, + delta: NumericDistributionDelta, + paired: Option, +} + +impl NumericColumnDistributionChange { + fn to_annotation_line(&self) -> String { + let mut line = format!( + "column '{}': mean {} -> {}, median {} -> {}, range {}-{} -> {}-{}", + self.column, + format_number(self.left.mean), + format_number(self.right.mean), + format_number(self.left.median), + format_number(self.right.median), + format_number(self.left.min), + format_number(self.left.max), + format_number(self.right.min), + format_number(self.right.max), + ); + if self.left.null_count != self.right.null_count { + line.push_str(&format!( + ", nulls {} -> {}", + self.left.null_count, self.right.null_count + )); + } + if let Some(paired) = &self.paired { + if 
paired.changed_rows > 0 { + line.push_str(&format!( + ", mean abs delta {} across {} paired row{}", + format_number(paired.mean_absolute_delta), + paired.changed_rows, + if paired.changed_rows == 1 { "" } else { "s" } + )); + } + } + line + } +} + +fn format_number(value: f64) -> String { + let rounded = if value.abs() < 1_000_000.0 { + format!("{value:.3}") + } else { + format!("{value:.6e}") + }; + rounded + .trim_end_matches('0') + .trim_end_matches('.') + .to_string() +} + +fn numeric_column(table: &TabularData, column: &str) -> Option { + let index = table.column_index(column)?; + let mut values = Vec::new(); + let mut null_count = 0_u64; + + for row in &table.rows { + let raw = row.get(index).map(String::as_str).unwrap_or("").trim(); + if raw.is_empty() { + null_count += 1; + continue; + } + let value = raw.parse::().ok()?; + if !value.is_finite() { + return None; + } + values.push(value); + } + + Some(ColumnNumbers { values, null_count }) +} + +fn stats_for_numbers(numbers: &ColumnNumbers) -> NumericDistributionStats { + let mut values = numbers.values.clone(); + values.sort_by(|left, right| left.total_cmp(right)); + let count = values.len() as u64; + NumericDistributionStats { + count, + null_count: numbers.null_count, + min: values[0], + max: values[count as usize - 1], + mean: values.iter().sum::() / count as f64, + median: quantile(&values, 0.5), + q1: quantile(&values, 0.25), + q3: quantile(&values, 0.75), + } +} + +fn quantile(values: &[f64], p: f64) -> f64 { + if values.len() == 1 { + return values[0]; + } + let position = p * (values.len() - 1) as f64; + let lower = position.floor() as usize; + let upper = position.ceil() as usize; + if lower == upper { + values[lower] + } else { + let weight = position - lower as f64; + values[lower] + (values[upper] - values[lower]) * weight + } +} + +fn paired_magnitude( + left: &TabularData, + right: &TabularData, + column: &str, + pairing: &Pairing, +) -> Option { + let left_index = left.column_index(column)?; + 
let right_index = right.column_index(column)?; + + let mut compared_rows = 0_u64; + let mut changed_rows = 0_u64; + let mut sum_abs_delta = 0.0_f64; + let mut max_abs_delta = 0.0_f64; + + for (left_row, right_row) in &pairing.matched { + let left_value = parse_numeric_cell(&left.rows[*left_row], left_index); + let right_value = parse_numeric_cell(&right.rows[*right_row], right_index); + let (Some(left_value), Some(right_value)) = (left_value, right_value) else { + continue; + }; + compared_rows += 1; + let abs_delta = (right_value - left_value).abs(); + if abs_delta > EPSILON { + changed_rows += 1; + sum_abs_delta += abs_delta; + if abs_delta > max_abs_delta { + max_abs_delta = abs_delta; + } + } + } + + if compared_rows == 0 { + return None; + } + + Some(NumericPairMagnitude { + compared_rows, + changed_rows, + mean_absolute_delta: if changed_rows == 0 { + 0.0 + } else { + sum_abs_delta / changed_rows as f64 + }, + max_absolute_delta: max_abs_delta, + }) +} + +fn parse_numeric_cell(row: &[String], index: usize) -> Option { + let raw = row.get(index).map(String::as_str).unwrap_or("").trim(); + if raw.is_empty() { + return None; + } + let value = raw.parse::().ok()?; + value.is_finite().then_some(value) +} + +fn distribution_changed( + left: &NumericDistributionStats, + right: &NumericDistributionStats, + paired: Option<&NumericPairMagnitude>, +) -> bool { + left.null_count != right.null_count + || (left.min - right.min).abs() > EPSILON + || (left.max - right.max).abs() > EPSILON + || (left.mean - right.mean).abs() > EPSILON + || (left.median - right.median).abs() > EPSILON + || (left.q1 - right.q1).abs() > EPSILON + || (left.q3 - right.q3).abs() > EPSILON + || paired.is_some_and(|paired| paired.changed_rows > 0) +} + +#[cfg(test)] +mod tests { + use super::*; + use binoc_core::data_access::LocalDataAccess; + + fn publish_pair( + data: &LocalDataAccess, + node: &mut DiffNode, + left: &TabularData, + right: &TabularData, + ) { + let left_bytes = 
serde_json::to_vec(left).unwrap(); + let right_bytes = serde_json::to_vec(right).unwrap(); + node.artifacts.push( + data.publish_artifact(&tabular_v1(), ArtifactSubject::Left, "test", &left_bytes) + .unwrap(), + ); + node.artifacts.push( + data.publish_artifact(&tabular_v1(), ArtifactSubject::Right, "test", &right_bytes) + .unwrap(), + ); + } + + #[test] + fn annotates_numeric_distribution_shift_for_changed_column() { + let data = LocalDataAccess::new(); + let mut node = DiffNode::new("modify", "tabular", "data.csv"); + node.details.insert( + "row_identity".into(), + serde_json::json!({ "columns": ["id"] }), + ); + publish_pair( + &data, + &mut node, + &TabularData { + headers: vec!["id".into(), "score".into(), "label".into()], + rows: vec![ + vec!["1".into(), "10".into(), "a".into()], + vec!["2".into(), "20".into(), "b".into()], + vec!["3".into(), "".into(), "c".into()], + ], + }, + &TabularData { + headers: vec!["id".into(), "score".into(), "label".into()], + rows: vec![ + vec!["1".into(), "15".into(), "a".into()], + vec!["2".into(), "35".into(), "b2".into()], + vec!["3".into(), "45".into(), "c".into()], + ], + }, + ); + + let result = + TabularStatsAnnotator.transform(node, &data, &serde_json::json!({ "enabled": true })); + + let TransformResult::Replace(node) = result else { + panic!("expected node replacement"); + }; + let annotation = node.binoc_annotation(DISTRIBUTION_ANNOTATION_KEY).unwrap(); + assert_eq!(annotation.package, "binoc"); + assert_eq!(annotation.key, "distribution_shifts"); + let lines: Vec = serde_json::from_value(annotation.value.clone()).unwrap(); + assert_eq!( + lines, + vec![ + "column 'score': mean 15 -> 31.667, median 15 -> 35, range 10-20 -> 15-45, nulls 1 -> 0, mean abs delta 10 across 2 paired rows" + ] + ); + } + + #[test] + fn stays_disabled_without_explicit_config() { + let data = LocalDataAccess::new(); + let mut node = DiffNode::new("modify", "tabular", "data.csv"); + publish_pair( + &data, + &mut node, + &TabularData { + 
headers: vec!["score".into()], + rows: vec![vec!["1".into()]], + }, + &TabularData { + headers: vec!["score".into()], + rows: vec![vec!["2".into()]], + }, + ); + + let result = TabularStatsAnnotator.transform(node, &data, &serde_json::Value::Null); + assert!(matches!(result, TransformResult::Unchanged)); + } +} diff --git a/binoc-stdlib/tests/transformer_tests.rs b/binoc-stdlib/tests/transformer_tests.rs index 460d924..950ee39 100644 --- a/binoc-stdlib/tests/transformer_tests.rs +++ b/binoc-stdlib/tests/transformer_tests.rs @@ -755,9 +755,11 @@ fn folder_move_rolls_up_partial_rename_and_keeps_remainders() { .with_tag("binoc.move") .with_tag("binoc.move.modified") .with_tag("binoc.content-changed"); - moved_modified - .annotations - .insert("content_summary".into(), serde_json::json!("2 lines added")); + moved_modified.annotate_from( + "binoc", + "content_summary", + serde_json::json!("2 lines added"), + ); let dst = DiffNode::new("add", "directory", "newdir").with_children(vec![ DiffNode::new("move", "file", "newdir/a.txt") diff --git a/docs/adr/2026-06-03-progressive_renderer_annotations.md b/docs/adr/2026-06-03-progressive_renderer_annotations.md new file mode 100644 index 0000000..51ba0a5 --- /dev/null +++ b/docs/adr/2026-06-03-progressive_renderer_annotations.md @@ -0,0 +1,94 @@ +# Progressive Renderer Annotations + +**Date:** 2026-06-03 +**Status:** Implemented + +## Context + +Comparators and transformers sometimes learn renderer-visible facts that do not +belong in the core action/type/tag model. For example, the tabular distribution +stats annotator can summarize a numeric distribution shift after the CSV +comparator has produced the tabular diff tree. A renderer should be able to +surface that fact without the transformer emitting Markdown and without the +core controller learning about tabular data. + +The first implementation path for distribution stats made Markdown aware of a +specific annotation shape. 
That worked for the feature, but it coupled a +general renderer to one transformer too early. The more important design +pressure is simple plugin composability: a transformer should be able to attach +small facts, and renderers should have a predictable fallback even when they do +not know the package-specific meaning. + +## Decision + +Add a renderer-visible `Annotation` record to the SDK/IR: + +```rust +pub struct Annotation { + pub package: String, + pub key: String, + pub value: serde_json::Value, +} +``` + +`DiffNode::annotate_from(package, key, value)` attaches an explicitly +namespaced annotation. Reusing the same `package` and `key` replaces the prior +value. The SDK deliberately does not provide `DiffNode::annotate(key, value)` +yet, because there is no ambient "current package" identity in comparator or +transformer calls. A future context-aware API can add that shorthand once it can +default to the calling package rather than to `binoc`. + +Annotation values are progressively typed JSON. Producers can start with a +string, a list of strings, or another simple JSON value. Renderers can consume +the generic JSON shape immediately, and can later add package/key-specific +handling if a stable semantic contract emerges. There is no separate version +field for now; the current contract is the `package`, `key`, and JSON `value` +shape itself. + +Markdown renders annotations generically: + +- scalar values render inline under a humanized annotation label; +- arrays of strings render as bounded bullet lists; +- other JSON values render as compact JSON. + +Markdown deliberately does not special-case annotation types for distribution +stats. The stats annotator publishes human-readable strings under +`binoc/distribution_shifts`, demonstrating that transformers can provide useful +annotations without requiring renderer-specific code. 
The existing +`content_summary` and `tabular_summary` annotations remain special only in the +sense that Markdown suppresses their generic rendering because they are already +consumed as move/correlation trailer text. + +## Alternatives Considered + +Keep `annotations` as a map from key to JSON value. + +: This is convenient, but it has no package namespace. Third-party plugin packs + would collide on common keys such as `summary`, `note`, or `stats`, and the + renderer could not distinguish first-party conventions from external ones. + +Add explicit annotation versions immediately. + +: Versioning will matter once renderers start depending on package/key-specific + structured values. It is premature for the initial composability contract, + where the default renderer is intentionally duck-typing simple JSON shapes. + A future typed annotation can encode its version in its key, package contract, + or value object when a real consumer needs that stability. + +Restrict annotations to strings. + +: Strings are the simplest interoperable value, and many annotations should use + them. However, allowing JSON lets producers group strings or publish compact + structured values without another IR migration. The renderer still treats + unknown structure generically. + +Let transformers emit Markdown. + +: That would make transformer output renderer-specific and would bypass the + project rule that significance and presentation remain renderer concerns. + +Add Markdown handlers for each known annotation. + +: That would optimize for the first feature instead of the extension point. + The current default renderer should prove the general contract before adding + package/key-specific display rules. diff --git a/docs/adr/README.md b/docs/adr/README.md index 68e48d3..1e95227 100644 --- a/docs/adr/README.md +++ b/docs/adr/README.md @@ -7,6 +7,7 @@ Newer entries appear first. Each entry shows its date and current status. 
Create | Date | Title | Status | |---|---|---| | 2026-06-03 | [Transformer-Initiated Recompare as a Correspondence Contract](2026-06-03-transformer_initiated_recompare.md) | Implemented | +| 2026-06-03 | [Progressive Renderer Annotations](2026-06-03-progressive_renderer_annotations.md) | Implemented | | 2026-06-03 | [Error Diagnostics Are Reportable Findings](2026-06-03-error_diagnostics_are_reportable_findings.md) | Implemented | | 2026-06-02 | [Markdown Renderer Groups Replace Significance-Map Grouping](2026-06-02-renderer_groups.md) | Implemented | | 2026-06-01 | [Unified Dataset Config and Identity Policy](2026-06-01-unified_dataset_config_and_identity.md) | Accepted | diff --git a/docs/explanation/test-vectors-gallery.md b/docs/explanation/test-vectors-gallery.md index f17c819..7e71086 100644 --- a/docs/explanation/test-vectors-gallery.md +++ b/docs/explanation/test-vectors-gallery.md @@ -13,7 +13,7 @@ audience: new user, data steward, archivist These are runnable examples from binoc's test suite. Each example links to its source folder on GitHub, tells you whether it needs any extra setup, gives you the exact command to run, and shows the Markdown changelog binoc is expected to print. -Binoc currently ships **37 shared examples** in this gallery. +Binoc currently ships **38 shared examples** in this gallery. 
## One-time setup @@ -34,8 +34,9 @@ just materialize | [`csv-column-addition`](#csv-column-addition) | New column added | data.csv: Column added: 'email' | Default pipeline | | [`csv-column-removal`](#csv-column-removal) | Column removed | data.csv: Column removed: 'city' | Default pipeline | | [`csv-column-reorder`](#csv-column-reorder) | Columns shuffled, content identical | data.csv: Columns reordered (content unchanged) | Custom config | -| [`csv-keyed-null-duplicate`](#csv-keyed-null-duplicate) | Configured CSV row keys surface null and duplicate key diagnostics | data.csv: 4 rows added by key; 4 rows removed by key; 1 row modified by key | Default pipeline | -| [`csv-keyed-row-diff`](#csv-keyed-row-diff) | Configured CSV row keys match reordered rows and report keyed row/cell changes | data.csv: 1 row added by key; 1 row removed by key; 1 row modified by key | Default pipeline | +| [`csv-distribution-shift`](#csv-distribution-shift) | Numeric column distribution shifts with keyed row matching | data.csv: 4 rows modified by key | Custom config | +| [`csv-keyed-null-duplicate`](#csv-keyed-null-duplicate) | Configured CSV row keys surface null and duplicate key diagnostics | data.csv: 4 rows added by key; 4 rows removed by key; 1 row modified by key | Custom config | +| [`csv-keyed-row-diff`](#csv-keyed-row-diff) | Configured CSV row keys match reordered rows and report keyed row/cell changes | data.csv: 1 row added by key; 1 row removed by key; 1 row modified by key | Custom config | | [`csv-mixed-changes`](#csv-mixed-changes) | Multiple change types | data.csv: Column added: 'email'; columns reordered; 1 row added | Default pipeline | | [`csv-rename-modify`](#csv-rename-modify) | CSV renamed and a column added: detected as a single move with content diff via fuzzy correlation | data_v2.csv: Moved from data.csv (modified) | Default pipeline | | [`csv-row-addition`](#csv-row-addition) | New rows appended | data.csv: 2 rows added | Default pipeline | @@ -45,8 
+46,8 @@ just materialize | [`directory-file-copy`](#directory-file-copy) | New file with same content as an existing unchanged file detected as a copy | duplicate.txt: Copied from original.txt | Default pipeline | | [`directory-nested`](#directory-nested) | Subdirectories with mixed changes | data/extra.csv: New table (2 columns, 1 row) | Default pipeline | | [`directory-nested-with-tar`](#directory-nested-with-tar) | Shows binoc diffing a tar archive and a plain directory that contain overlapping internal paths. | data/records.csv: 1 row added | Default pipeline | -| [`file-correspondence-scheme`](#file-correspondence-scheme) | Config declares that a state CSV moved into a new directory scheme is the same logical file | states/AL.csv: 1 row added | Default pipeline | -| [`file-correspondence-token`](#file-correspondence-token) | Config declares that year-stamped CSV filenames are the same logical file | running_list.csv: 1 row added | Default pipeline | +| [`file-correspondence-scheme`](#file-correspondence-scheme) | Config declares that a state CSV moved into a new directory scheme is the same logical file | states/AL.csv: 1 row added | Custom config | +| [`file-correspondence-token`](#file-correspondence-token) | Config declares that year-stamped CSV filenames are the same logical file | running_list.csv: 1 row added | Custom config | | [`folder-move-nested`](#folder-move-nested) | Detects a whole-folder rename and rolls many file moves up into one folder-move entry. | documentation: Folder moved from docs | Default pipeline | | [`folder-move-partial`](#folder-move-partial) | Detects a mostly-moved folder rename and preserves only the added/removed/modified remainder entries beneath it. 
| FoodData_Central_csv_2026-04-30: Folder moved from FoodData_Central_csv_2025-12-18 | Custom config | | [`gzip-inner-dispatch`](#gzip-inner-dispatch) | Gzipped CSV and text are decompressed and redispatched under their inner names | census.txt: 1 line added, 1 removed | Default pipeline | @@ -192,6 +193,49 @@ Result: - **data.csv**: Columns reordered (content unchanged) ``` +## csv-distribution-shift + +Numeric column distribution shifts with keyed row matching + +- **Browse source:** [csv-distribution-shift](https://github.com/harvard-lil/binoc/tree/main/test-vectors/csv-distribution-shift) +- **Tags:** `csv`, `statistics`, `row-identity` +- **Snapshots:** `snapshot-a` has 1 file — `data.csv`; `snapshot-b` has 1 file — `data.csv` +- **Setup:** This example uses a custom dataset config to narrow the pipeline to the comparators and transformers that make the behavior obvious. +Save this dataset config as `/tmp/csv-distribution-shift.yaml`: + +```yaml +dataset: + tables: + defaults: + row_identity: + columns: + - id +transformer_config: + binoc.tabular_stats_annotator: + enabled: true +``` + + +Run it: +```bash +binoc diff \ + ./test-vectors-materialized/csv-distribution-shift/snapshot-a \ + ./test-vectors-materialized/csv-distribution-shift/snapshot-b \ + --config /tmp/csv-distribution-shift.yaml +``` +Result: +```markdown +# Changelog: snapshot-a → snapshot-b + +- **data.csv**: 4 rows modified by key + - Distribution shifts + - column 'score': mean 20 -> 35.5, median 20 -> 40, range 10-30 -> 12-50, nulls 1 -> 0, mean abs delta 10.667 across 3 paired rows + - Changed cells (showing 3 of 5); use `binoc extract CHANGESET "data.csv" cells_changed` for all changed cells + - key id '1', column 'score': '10' -> '12' + - key id '2', column 'label': 'beta' -> 'beta2' + - key id '2', column 'score': '20' -> '35' +``` + ## csv-keyed-null-duplicate Configured CSV row keys surface null and duplicate key diagnostics @@ -199,12 +243,29 @@ Configured CSV row keys surface null 
and duplicate key diagnostics - **Browse source:** [csv-keyed-null-duplicate](https://github.com/harvard-lil/binoc/tree/main/test-vectors/csv-keyed-null-duplicate) - **Tags:** `csv`, `keyed`, `null-key`, `duplicate-key` - **Snapshots:** `snapshot-a` has 1 file — `data.csv`; `snapshot-b` has 1 file — `data.csv` +- **Setup:** This example uses a custom dataset config to narrow the pipeline to the comparators and transformers that make the behavior obvious. +Save this dataset config as `/tmp/csv-keyed-null-duplicate.yaml`: + +```yaml +dataset: + tables: + defaults: + row_identity: + on_null_key: diagnostic + on_duplicate_key: diagnostic + entries: + - path_regex: ^data\.csv$ + columns: + - id +``` + Run it: ```bash binoc diff \ ./test-vectors-materialized/csv-keyed-null-duplicate/snapshot-a \ - ./test-vectors-materialized/csv-keyed-null-duplicate/snapshot-b + ./test-vectors-materialized/csv-keyed-null-duplicate/snapshot-b \ + --config /tmp/csv-keyed-null-duplicate.yaml ``` Result: ```markdown @@ -227,12 +288,24 @@ Configured CSV row keys match reordered rows and report keyed row/cell changes - **Browse source:** [csv-keyed-row-diff](https://github.com/harvard-lil/binoc/tree/main/test-vectors/csv-keyed-row-diff) - **Tags:** `csv`, `keyed`, `row-addition`, `row-removal`, `cell-change` - **Snapshots:** `snapshot-a` has 1 file — `data.csv`; `snapshot-b` has 1 file — `data.csv` +- **Setup:** This example uses a custom dataset config to narrow the pipeline to the comparators and transformers that make the behavior obvious. 
+Save this dataset config as `/tmp/csv-keyed-row-diff.yaml`: + +```yaml +dataset: + tables: + - path_regex: ^data\.csv$ + columns: + - id +``` + Run it: ```bash binoc diff \ ./test-vectors-materialized/csv-keyed-row-diff/snapshot-a \ - ./test-vectors-materialized/csv-keyed-row-diff/snapshot-b + ./test-vectors-materialized/csv-keyed-row-diff/snapshot-b \ + --config /tmp/csv-keyed-row-diff.yaml ``` Result: ```markdown @@ -462,12 +535,31 @@ Config declares that a state CSV moved into a new directory scheme is the same l - **Browse source:** [file-correspondence-scheme](https://github.com/harvard-lil/binoc/tree/main/test-vectors/file-correspondence-scheme) - **Tags:** `csv`, `file-correspondence`, `scheme-change` - **Snapshots:** `snapshot-a` has 1 file — `data/state_AL.csv`; `snapshot-b` has 1 file — `by-state/AL/records.csv` +- **Setup:** This example uses a custom dataset config to narrow the pipeline to the comparators and transformers that make the behavior obvious. +Save this dataset config as `/tmp/file-correspondence-scheme.yaml`: + +```yaml +dataset: + files: + correspondences: + - name: state-records + key: "${state}" + logical_path: "states/${state}.csv" + on_null_key: diagnostic + on_duplicate_key: diagnostic + left: + path_regex: "^data/state_(?P<state>[A-Z]{2})\\.csv$" + right: + path_regex: "^by-state/(?P<state>[A-Z]{2})/records\\.csv$" +``` + Run it: ```bash binoc diff \ ./test-vectors-materialized/file-correspondence-scheme/snapshot-a \ - ./test-vectors-materialized/file-correspondence-scheme/snapshot-b + ./test-vectors-materialized/file-correspondence-scheme/snapshot-b \ + --config /tmp/file-correspondence-scheme.yaml ``` Result: ```markdown @@ -483,12 +575,31 @@ Config declares that year-stamped CSV filenames are the same logical file - **Browse source:** [file-correspondence-token](https://github.com/harvard-lil/binoc/tree/main/test-vectors/file-correspondence-token) - **Tags:** `csv`, `file-correspondence`, `declared-correspondence` - **Snapshots:** `snapshot-a` 
has 1 file — `running_list_as_of_2022.csv`; `snapshot-b` has 1 file — `running_list_as_of_2023.csv` +- **Setup:** This example uses a custom dataset config to narrow the pipeline to the comparators and transformers that make the behavior obvious. +Save this dataset config as `/tmp/file-correspondence-token.yaml`: + +```yaml +dataset: + files: + correspondences: + - name: running-list + key: "${list}" + logical_path: "${list}.csv" + on_null_key: diagnostic + on_duplicate_key: diagnostic + left: + path_regex: "^(?P<list>running_list)_as_of_[0-9]{4}\\.csv$" + right: + path_regex: "^(?P<list>running_list)_as_of_[0-9]{4}\\.csv$" +``` + Run it: ```bash binoc diff \ ./test-vectors-materialized/file-correspondence-token/snapshot-a \ - ./test-vectors-materialized/file-correspondence-token/snapshot-b + ./test-vectors-materialized/file-correspondence-token/snapshot-b \ + --config /tmp/file-correspondence-token.yaml ``` Result: ```markdown diff --git a/docs/howto/write-a-python-comparator.md b/docs/howto/write-a-python-comparator.md index 5c79470..2235dcd 100644 --- a/docs/howto/write-a-python-comparator.md +++ b/docs/howto/write-a-python-comparator.md @@ -133,6 +133,7 @@ Nodes are immutable-ish — builder methods return new nodes: node = binoc.DiffNode(action="modify", item_type="fasta", path="seqs.fa") node = node.with_tag("biobinoc.gap-change") node = node.with_detail("gap_count", 42) +node = node.annotate_from("biobinoc", "note", "large gap shift") node = node.with_source_path("old_seqs.fa") # for moves/renames node = node.with_children([child1, child2]) @@ -143,7 +144,7 @@ node.path # "seqs.fa" node.tags # ["biobinoc.gap-change"] node.details # {"gap_count": 42} node.children # [child1, child2] -node.annotations # {} — typically set by transformers +node.annotations # [{"package": "biobinoc", "key": "note", "value": "..."}] ``` ## Limits of Python comparators diff --git a/docs/reference/changeset-schema.json b/docs/reference/changeset-schema.json index 9f8d770..b20cf39 100644 --- 
a/docs/reference/changeset-schema.json +++ b/docs/reference/changeset-schema.json @@ -38,6 +38,24 @@ "to_snapshot" ], "$defs": { + "Annotation": { + "description": "Renderer-visible metadata attached to a diff node by a comparator or\ntransformer.\n\nAnnotations are intentionally progressively typed: producers can start with\na string or simple JSON value, and renderers can either display the generic\nvalue shape or add package/key-specific handling later. The package namespace\nkeeps independently-authored plugins from colliding on common keys.", + "type": "object", + "properties": { + "key": { + "type": "string" + }, + "package": { + "type": "string" + }, + "value": true + }, + "required": [ + "package", + "key", + "value" + ] + }, "ArtifactDescriptor": { "description": "Descriptor for a published artifact attached to a node.\n\nArtifacts are the unified mechanism for both private reuse and\ncross-plugin composition. A comparator or transformer publishes\nzero or more artifacts; downstream plugins consume them by format.", "type": "object", @@ -226,9 +244,11 @@ "type": "string" }, "annotations": { - "description": "Transformer-added metadata.", - "type": "object", - "additionalProperties": true + "description": "Renderer-visible annotations supplied by comparators or transformers.", + "type": "array", + "items": { + "$ref": "#/$defs/Annotation" + } }, "artifacts": { "description": "Published artifacts for this node. Session-scoped working data: carried\nacross the plugin ABI wire as descriptors (the bytes live in the shared\n`data_root` cache), but not meaningful outside a session. 
Callers\nwriting changeset output must strip this via\n[`DiffNode::strip_transient`] before serializing.", diff --git a/docs/reference/changeset-schema.md b/docs/reference/changeset-schema.md index 6505a27..44243e4 100644 --- a/docs/reference/changeset-schema.md +++ b/docs/reference/changeset-schema.md @@ -72,7 +72,7 @@ A node in the diff tree — the central data structure of the system. Every comp | Field | Type | Required | Description | |---|---|---|---| | `action` | string | yes | Open enum: "add", "remove", "modify", "move", "reorder", "schema_change", etc. Plugins may define new actions. | -| `annotations` | object (free-form) | no | Transformer-added metadata. | +| `annotations` | array of [`Annotation`](#annotation) | no | Renderer-visible annotations supplied by comparators or transformers. | | `artifacts` | array of [`ArtifactDescriptor`](#artifactdescriptor) | no | Published artifacts for this node. Session-scoped working data: carried across the plugin ABI wire as descriptors (the bytes live in the shared `data_root` cache), but not meaningful outside a session. Callers writing changeset output must strip this via [`DiffNode::strip_transient`] before serializing. | | `children` | array of [`DiffNode`](#diffnode) | no | Child diff nodes forming the tree structure. | | `comparator` | string \| null | no | Which comparator produced this node (provenance for extract chain). | @@ -141,6 +141,16 @@ String enum. One of: - `right` - `pair` +### `Annotation` + +Renderer-visible metadata attached to a diff node by a comparator or transformer. Annotations are intentionally progressively typed: producers can start with a string or simple JSON value, and renderers can either display the generic value shape or add package/key-specific handling later. The package namespace keeps independently-authored plugins from colliding on common keys. 
+ +| Field | Type | Required | Description | +|---|---|---|---| +| `key` | string | yes | | +| `package` | string | yes | | +| `value` | any | yes | | + ### `DetailBlock` Renderer-visible, bounded evidence attached to a diff node. diff --git a/mkdocs.yml b/mkdocs.yml index 5cb7342..766029e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -162,6 +162,7 @@ nav: # BEGIN-ADR-NAV - adr/README.md - 'Transformer-Initiated Recompare as a Correspondence Contract': adr/2026-06-03-transformer_initiated_recompare.md + - 'Progressive Renderer Annotations': adr/2026-06-03-progressive_renderer_annotations.md - 'Error Diagnostics Are Reportable Findings': adr/2026-06-03-error_diagnostics_are_reportable_findings.md - 'Markdown Renderer Groups Replace Significance-Map Grouping': adr/2026-06-02-renderer_groups.md - 'Unified Dataset Config and Identity Policy': adr/2026-06-01-unified_dataset_config_and_identity.md diff --git a/scripts/build_test_vector_gallery.py b/scripts/build_test_vector_gallery.py index 17ca6d4..64c6c47 100644 --- a/scripts/build_test_vector_gallery.py +++ b/scripts/build_test_vector_gallery.py @@ -60,6 +60,7 @@ class DocsMeta: class ManifestConfig: comparators: list[str] | None = None transformers: list[str] | None = None + dataset: dict[str, Any] | None = None output: dict[str, Any] | None = None transformer_config: dict[str, Any] | None = None @@ -70,6 +71,7 @@ def has_customization(self) -> bool: for value in ( self.comparators, self.transformers, + self.dataset, self.output, self.transformer_config, ) @@ -172,6 +174,7 @@ def _parse_config(manifest_path: Path, raw: Any) -> ManifestConfig: comparators = raw.get("comparators") transformers = raw.get("transformers") + dataset = raw.get("dataset") output = raw.get("output") transformer_config = raw.get("transformer_config") @@ -183,6 +186,8 @@ def _parse_config(manifest_path: Path, raw: Any) -> ManifestConfig: transformers = _validate_str_list( transformers, f"{manifest_path}: [config].transformers" ) + if dataset is 
not None and not isinstance(dataset, dict): + _die(f"{manifest_path}: [config].dataset must be a table") if output is not None and not isinstance(output, dict): _die(f"{manifest_path}: [config].output must be a table") if transformer_config is not None and not isinstance(transformer_config, dict): @@ -191,6 +196,7 @@ def _parse_config(manifest_path: Path, raw: Any) -> ManifestConfig: return ManifestConfig( comparators=comparators, transformers=transformers, + dataset=dataset, output=output, transformer_config=transformer_config, ) @@ -250,6 +256,8 @@ def _render_config_yaml(config: ManifestConfig) -> str | None: data["comparators"] = config.comparators if config.transformers: data["transformers"] = config.transformers + if config.dataset: + data["dataset"] = config.dataset if config.transformer_config: data["transformer_config"] = config.transformer_config if config.output: diff --git a/test-vectors/csv-cell-changes/expected-output/abi-log.snap b/test-vectors/csv-cell-changes/expected-output/abi-log.snap index a6607f1..1f96110 100644 --- a/test-vectors/csv-cell-changes/expected-output/abi-log.snap +++ b/test-vectors/csv-cell-changes/expected-output/abi-log.snap @@ -138,5 +138,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/csv-column-addition/expected-output/abi-log.snap b/test-vectors/csv-column-addition/expected-output/abi-log.snap index 8a0f03f..e78e67d 100644 --- a/test-vectors/csv-column-addition/expected-output/abi-log.snap +++ b/test-vectors/csv-column-addition/expected-output/abi-log.snap @@ -138,5 +138,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/csv-column-removal/expected-output/abi-log.snap b/test-vectors/csv-column-removal/expected-output/abi-log.snap index 389aa60..09bab9e 100644 --- 
a/test-vectors/csv-column-removal/expected-output/abi-log.snap +++ b/test-vectors/csv-column-removal/expected-output/abi-log.snap @@ -138,5 +138,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/csv-distribution-shift/expected-output/abi-log.snap b/test-vectors/csv-distribution-shift/expected-output/abi-log.snap new file mode 100644 index 0000000..fbd3b9d --- /dev/null +++ b/test-vectors/csv-distribution-shift/expected-output/abi-log.snap @@ -0,0 +1,166 @@ +--- +source: binoc-stdlib/src/test_vectors.rs +expression: "&abi_log" +--- +[ + { + "method": "compare", + "plugin": "binoc.directory", + "data_ops": [ + { + "op": "local_path", + "handle": "" + }, + { + "op": "local_path", + "handle": "" + }, + { + "op": "register_local", + "logical": "data.csv" + }, + { + "op": "read_bytes", + "handle": "data.csv", + "result_size": 56 + }, + { + "op": "register_local", + "logical": "data.csv" + }, + { + "op": "read_bytes", + "handle": "data.csv", + "result_size": 59 + } + ] + }, + { + "method": "compare", + "plugin": "binoc.csv", + "data_ops": [ + { + "op": "local_path", + "handle": "data.csv" + }, + { + "op": "local_path", + "handle": "data.csv" + }, + { + "op": "publish_artifact", + "format": { + "package": "binoc", + "name": "tabular", + "version": 1 + }, + "subject": "Left", + "producer": "binoc.csv", + "size": 116 + }, + { + "op": "publish_artifact", + "format": { + "package": "binoc", + "name": "tabular", + "version": 1 + }, + "subject": "Right", + "producer": "binoc.csv", + "size": 119 + } + ] + }, + { + "method": "transform", + "plugin": "binoc.declared_correspondence", + "data_ops": [] + }, + { + "method": "transform", + "plugin": "binoc.correlation_detector", + "data_ops": [] + }, + { + "method": "transform", + "plugin": "binoc.fuzzy_correlation_detector", + "data_ops": [] + }, + { + "method": "transform", + "plugin": 
"binoc.folder_move_detector", + "data_ops": [] + }, + { + "method": "transform", + "plugin": "binoc.table_splitter", + "data_ops": [ + { + "op": "get_artifact", + "format": { + "package": "binoc", + "name": "tabular", + "version": 1 + }, + "found": true + }, + { + "op": "get_artifact", + "format": { + "package": "binoc", + "name": "tabular", + "version": 1 + }, + "found": true + } + ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_analyzer", + "data_ops": [ + { + "op": "get_artifact", + "format": { + "package": "binoc", + "name": "tabular", + "version": 1 + }, + "found": true + }, + { + "op": "get_artifact", + "format": { + "package": "binoc", + "name": "tabular", + "version": 1 + }, + "found": true + } + ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [ + { + "op": "get_artifact", + "format": { + "package": "binoc", + "name": "tabular", + "version": 1 + }, + "found": true + }, + { + "op": "get_artifact", + "format": { + "package": "binoc", + "name": "tabular", + "version": 1 + }, + "found": true + } + ] + } +] diff --git a/test-vectors/csv-distribution-shift/expected-output/changelog.snap b/test-vectors/csv-distribution-shift/expected-output/changelog.snap new file mode 100644 index 0000000..64d06fb --- /dev/null +++ b/test-vectors/csv-distribution-shift/expected-output/changelog.snap @@ -0,0 +1,13 @@ +--- +source: binoc-stdlib/src/test_vectors.rs +expression: "&md" +--- +# Changelog: snapshot-a → snapshot-b + +- **data.csv**: 4 rows modified by key + - Distribution shifts + - column 'score': mean 20 -> 35.5, median 20 -> 40, range 10-30 -> 12-50, nulls 1 -> 0, mean abs delta 10.667 across 3 paired rows + - Changed cells (showing 3 of 5); use `binoc extract CHANGESET "data.csv" cells_changed` for all changed cells + - key id '1', column 'score': '10' -> '12' + - key id '2', column 'label': 'beta' -> 'beta2' + - key id '2', column 'score': '20' -> '35' diff --git 
a/test-vectors/csv-distribution-shift/expected-output/changeset.snap b/test-vectors/csv-distribution-shift/expected-output/changeset.snap new file mode 100644 index 0000000..07d6fd1 --- /dev/null +++ b/test-vectors/csv-distribution-shift/expected-output/changeset.snap @@ -0,0 +1,166 @@ +--- +source: binoc-stdlib/src/test_vectors.rs +expression: "&stable_changeset" +--- +{ + "from_snapshot": "snapshot-a", + "to_snapshot": "snapshot-b", + "root": { + "action": "modify", + "item_type": "directory", + "path": "", + "children": [ + { + "action": "modify", + "item_type": "tabular", + "path": "data.csv", + "summary": "4 rows modified by key", + "tags": [ + "binoc.cell-change" + ], + "details": { + "cells_changed": 5, + "columns_added": [], + "columns_left": [ + "id", + "score", + "label" + ], + "columns_removed": [], + "columns_right": [ + "id", + "score", + "label" + ], + "hash_left": "80034b65c9113ce13204651a58eb57726dd055894d2c8d5b7a3a6943d24d5941", + "hash_right": "9ffd3e0e8207a6ce0f0454302c45c39815d2179e3562dd924f971fdc989eeb15", + "row_identity": { + "columns": [ + "id" + ], + "matched_rows": 4, + "mode": "keyed", + "on_duplicate_key": "diagnostic", + "on_null_key": "diagnostic" + }, + "rows_added": 0, + "rows_left": 4, + "rows_modified": 4, + "rows_removed": 0, + "rows_right": 4 + }, + "detail_blocks": [ + { + "id": "cells_changed", + "kind": "binoc.tabular.cell_changes.v1", + "label": "Changed cells", + "total_count": 5, + "examples": [ + { + "locator": { + "column": "score", + "key": { + "id": "1" + } + }, + "before": { + "value": "10", + "media_type": "text/plain" + }, + "after": { + "value": "12", + "media_type": "text/plain" + } + }, + { + "locator": { + "column": "label", + "key": { + "id": "2" + } + }, + "before": { + "value": "beta", + "media_type": "text/plain" + }, + "after": { + "value": "beta2", + "media_type": "text/plain" + } + }, + { + "locator": { + "column": "score", + "key": { + "id": "2" + } + }, + "before": { + "value": "20", + "media_type": 
"text/plain" + }, + "after": { + "value": "35", + "media_type": "text/plain" + } + }, + { + "locator": { + "column": "score", + "key": { + "id": "3" + } + }, + "before": { + "value": "30", + "media_type": "text/plain" + }, + "after": { + "value": "45", + "media_type": "text/plain" + } + }, + { + "locator": { + "column": "score", + "key": { + "id": "4" + } + }, + "before": { + "value": "", + "media_type": "text/plain" + }, + "after": { + "value": "50", + "media_type": "text/plain" + } + } + ], + "extract": [ + { + "aspect": "cells_changed", + "label": "All changed cells" + } + ] + } + ], + "annotations": [ + { + "package": "binoc", + "key": "distribution_shifts", + "value": [ + "column 'score': mean 20 -> 35.5, median 20 -> 40, range 10-30 -> 12-50, nulls 1 -> 0, mean abs delta 10.667 across 3 paired rows" + ] + } + ], + "comparator": "binoc.csv", + "transformed_by": [ + "binoc.tabular_analyzer", + "binoc.tabular_stats_annotator" + ] + } + ], + "comparator": "binoc.directory" + } +} diff --git a/test-vectors/csv-distribution-shift/manifest.toml b/test-vectors/csv-distribution-shift/manifest.toml new file mode 100644 index 0000000..50926d8 --- /dev/null +++ b/test-vectors/csv-distribution-shift/manifest.toml @@ -0,0 +1,13 @@ +[vector] +name = "csv-distribution-shift" +description = "Numeric column distribution shifts with keyed row matching" +tags = ["csv", "statistics", "row-identity"] + +[config] +dataset = { tables = { defaults = { row_identity = { columns = ["id"] } } } } + +[config.transformer_config."binoc.tabular_stats_annotator"] +enabled = true + +[expected] +has_tags = ["binoc.cell-change"] diff --git a/test-vectors/csv-distribution-shift/snapshot-a/data.csv b/test-vectors/csv-distribution-shift/snapshot-a/data.csv new file mode 100644 index 0000000..fbb8be8 --- /dev/null +++ b/test-vectors/csv-distribution-shift/snapshot-a/data.csv @@ -0,0 +1,5 @@ +id,score,label +1,10,alpha +2,20,beta +3,30,gamma +4,,delta diff --git 
a/test-vectors/csv-distribution-shift/snapshot-b/data.csv b/test-vectors/csv-distribution-shift/snapshot-b/data.csv new file mode 100644 index 0000000..30bd792 --- /dev/null +++ b/test-vectors/csv-distribution-shift/snapshot-b/data.csv @@ -0,0 +1,5 @@ +id,score,label +1,12,alpha +2,35,beta2 +3,45,gamma +4,50,delta diff --git a/test-vectors/csv-keyed-null-duplicate/expected-output/abi-log.snap b/test-vectors/csv-keyed-null-duplicate/expected-output/abi-log.snap index 4472785..23a4e4e 100644 --- a/test-vectors/csv-keyed-null-duplicate/expected-output/abi-log.snap +++ b/test-vectors/csv-keyed-null-duplicate/expected-output/abi-log.snap @@ -138,5 +138,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/csv-keyed-row-diff/expected-output/abi-log.snap b/test-vectors/csv-keyed-row-diff/expected-output/abi-log.snap index 29f9c38..fe62ea5 100644 --- a/test-vectors/csv-keyed-row-diff/expected-output/abi-log.snap +++ b/test-vectors/csv-keyed-row-diff/expected-output/abi-log.snap @@ -138,5 +138,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/csv-mixed-changes/expected-output/abi-log.snap b/test-vectors/csv-mixed-changes/expected-output/abi-log.snap index 90ff896..d84bfea 100644 --- a/test-vectors/csv-mixed-changes/expected-output/abi-log.snap +++ b/test-vectors/csv-mixed-changes/expected-output/abi-log.snap @@ -139,6 +139,11 @@ expression: "&abi_log" } ] }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] + }, { "method": "transform", "plugin": "binoc.column_reorder_detector", diff --git a/test-vectors/csv-rename-modify/expected-output/abi-log.snap b/test-vectors/csv-rename-modify/expected-output/abi-log.snap index 28e8ea2..207e6ac 100644 --- 
a/test-vectors/csv-rename-modify/expected-output/abi-log.snap +++ b/test-vectors/csv-rename-modify/expected-output/abi-log.snap @@ -191,5 +191,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/csv-rename-modify/expected-output/changeset.snap b/test-vectors/csv-rename-modify/expected-output/changeset.snap index 684fa8b..dc76dfb 100644 --- a/test-vectors/csv-rename-modify/expected-output/changeset.snap +++ b/test-vectors/csv-rename-modify/expected-output/changeset.snap @@ -46,9 +46,13 @@ expression: "&stable_changeset" "rows_removed": 0, "rows_right": 3 }, - "annotations": { - "tabular_summary": "Column added: 'email'" - }, + "annotations": [ + { + "package": "binoc", + "key": "tabular_summary", + "value": "Column added: 'email'" + } + ], "comparator": "binoc.csv", "transformed_by": [ "binoc.tabular_analyzer" diff --git a/test-vectors/csv-row-addition/expected-output/abi-log.snap b/test-vectors/csv-row-addition/expected-output/abi-log.snap index c45397b..ecc6f89 100644 --- a/test-vectors/csv-row-addition/expected-output/abi-log.snap +++ b/test-vectors/csv-row-addition/expected-output/abi-log.snap @@ -138,5 +138,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/csv-row-removal/expected-output/abi-log.snap b/test-vectors/csv-row-removal/expected-output/abi-log.snap index 2b2dcda..b3e9de4 100644 --- a/test-vectors/csv-row-removal/expected-output/abi-log.snap +++ b/test-vectors/csv-row-removal/expected-output/abi-log.snap @@ -138,5 +138,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/csv-stacked-tables/expected-output/abi-log.snap b/test-vectors/csv-stacked-tables/expected-output/abi-log.snap index 
e9112a6..a0c47f2 100644 --- a/test-vectors/csv-stacked-tables/expected-output/abi-log.snap +++ b/test-vectors/csv-stacked-tables/expected-output/abi-log.snap @@ -183,6 +183,11 @@ expression: "&abi_log" } ] }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] + }, { "method": "transform", "plugin": "binoc.table_collection_analyzer", diff --git a/test-vectors/csv-verbosity-full/expected-output/abi-log.snap b/test-vectors/csv-verbosity-full/expected-output/abi-log.snap index 3b1fd85..ec299e4 100644 --- a/test-vectors/csv-verbosity-full/expected-output/abi-log.snap +++ b/test-vectors/csv-verbosity-full/expected-output/abi-log.snap @@ -138,5 +138,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/directory-nested-with-tar/expected-output/abi-log.snap b/test-vectors/directory-nested-with-tar/expected-output/abi-log.snap index e2b3822..5fdc006 100644 --- a/test-vectors/directory-nested-with-tar/expected-output/abi-log.snap +++ b/test-vectors/directory-nested-with-tar/expected-output/abi-log.snap @@ -322,5 +322,15 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/directory-nested/expected-output/abi-log.snap b/test-vectors/directory-nested/expected-output/abi-log.snap index 9bd9700..23c828e 100644 --- a/test-vectors/directory-nested/expected-output/abi-log.snap +++ b/test-vectors/directory-nested/expected-output/abi-log.snap @@ -276,5 +276,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/file-correspondence-scheme/expected-output/abi-log.snap 
b/test-vectors/file-correspondence-scheme/expected-output/abi-log.snap index c0b6091..7939780 100644 --- a/test-vectors/file-correspondence-scheme/expected-output/abi-log.snap +++ b/test-vectors/file-correspondence-scheme/expected-output/abi-log.snap @@ -222,5 +222,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/file-correspondence-token/expected-output/abi-log.snap b/test-vectors/file-correspondence-token/expected-output/abi-log.snap index 0119064..2fef560 100644 --- a/test-vectors/file-correspondence-token/expected-output/abi-log.snap +++ b/test-vectors/file-correspondence-token/expected-output/abi-log.snap @@ -180,5 +180,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/gzip-inner-dispatch/expected-output/abi-log.snap b/test-vectors/gzip-inner-dispatch/expected-output/abi-log.snap index bfcf356..d2b3a94 100644 --- a/test-vectors/gzip-inner-dispatch/expected-output/abi-log.snap +++ b/test-vectors/gzip-inner-dispatch/expected-output/abi-log.snap @@ -1,5 +1,6 @@ --- source: binoc-stdlib/src/test_vectors.rs +assertion_line: 507 expression: "&abi_log" --- [ @@ -228,5 +229,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/kitchen-sink/expected-output/abi-log.snap b/test-vectors/kitchen-sink/expected-output/abi-log.snap index da57741..edc59c4 100644 --- a/test-vectors/kitchen-sink/expected-output/abi-log.snap +++ b/test-vectors/kitchen-sink/expected-output/abi-log.snap @@ -705,6 +705,21 @@ expression: "&abi_log" } ] }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] + 
}, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] + }, { "method": "transform", "plugin": "binoc.column_reorder_detector", diff --git a/test-vectors/single-file-modify-csv/expected-output/abi-log.snap b/test-vectors/single-file-modify-csv/expected-output/abi-log.snap index e42382d..be92539 100644 --- a/test-vectors/single-file-modify-csv/expected-output/abi-log.snap +++ b/test-vectors/single-file-modify-csv/expected-output/abi-log.snap @@ -106,5 +106,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/tar-nested/expected-output/abi-log.snap b/test-vectors/tar-nested/expected-output/abi-log.snap index b5eadc0..8a4e127 100644 --- a/test-vectors/tar-nested/expected-output/abi-log.snap +++ b/test-vectors/tar-nested/expected-output/abi-log.snap @@ -258,5 +258,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/tar-simple/expected-output/abi-log.snap b/test-vectors/tar-simple/expected-output/abi-log.snap index a19272e..aa00a98 100644 --- a/test-vectors/tar-simple/expected-output/abi-log.snap +++ b/test-vectors/tar-simple/expected-output/abi-log.snap @@ -232,5 +232,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/text-rename-modify/expected-output/changeset.snap b/test-vectors/text-rename-modify/expected-output/changeset.snap index 7dd2d01..c3d91fc 100644 --- a/test-vectors/text-rename-modify/expected-output/changeset.snap +++ b/test-vectors/text-rename-modify/expected-output/changeset.snap @@ -1,6 +1,5 @@ --- source: binoc-stdlib/src/test_vectors.rs -assertion_line: 477 expression: "&stable_changeset" --- { @@ -30,9 +29,13 @@ expression: "&stable_changeset" 
"lines_removed": 0, "lines_unchanged": 13 }, - "annotations": { - "content_summary": "2 lines added" - }, + "annotations": [ + { + "package": "binoc", + "key": "content_summary", + "value": "2 lines added" + } + ], "comparator": "binoc.text" } ], diff --git a/test-vectors/tsv-cell-changes/expected-output/abi-log.snap b/test-vectors/tsv-cell-changes/expected-output/abi-log.snap index cab34dd..a753169 100644 --- a/test-vectors/tsv-cell-changes/expected-output/abi-log.snap +++ b/test-vectors/tsv-cell-changes/expected-output/abi-log.snap @@ -138,5 +138,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ] diff --git a/test-vectors/zip-nested/expected-output/abi-log.snap b/test-vectors/zip-nested/expected-output/abi-log.snap index 4323a5e..544e047 100644 --- a/test-vectors/zip-nested/expected-output/abi-log.snap +++ b/test-vectors/zip-nested/expected-output/abi-log.snap @@ -258,5 +258,10 @@ expression: "&abi_log" "found": true } ] + }, + { + "method": "transform", + "plugin": "binoc.tabular_stats_annotator", + "data_ops": [] } ]