diff --git a/Cargo.lock b/Cargo.lock index 5769f5b..475b02e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -421,6 +421,7 @@ name = "git-ast" version = "0.1.0" dependencies = [ "cucumber", + "serde_json", "tempfile", "tokio", "tree-sitter", diff --git a/Cargo.toml b/Cargo.toml index eacddce..b8c1507 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,10 +9,13 @@ repository = "https://github.com/bounded-systems/git-ast" # Tree-sitter parses source into a concrete syntax tree; the `printer` module # walks that tree and re-emits canonical source. No libgit2 needed: the # clean/smudge filter speaks Git's pkt-line `filter-process` protocol over -# stdin/stdout, implemented in `pktline`. +# stdin/stdout, implemented in `pktline`. The `json` language uses serde_json: +# its `Map` is a `BTreeMap` (keys sorted) with deterministic scalar formatting — +# exactly the value-level normalization a canonical form needs. [dependencies] tree-sitter = "0.22" tree-sitter-rust = "0.21" +serde_json = "1" # The Gherkin suite (tests/features/*.feature) executes the README's claims # against real `git`, driving the built binary as the clean/smudge filter. diff --git a/README.md b/README.md index e6f2457..5e91a27 100644 --- a/README.md +++ b/README.md @@ -48,17 +48,19 @@ For a full documentation overview, see [Documentation Index](./docs/README.md). ## Project Status -**Working clean/smudge round-trip for a Rust subset.** The core pipeline is -implemented and runs through real Git: +**Working clean/smudge round-trip for two languages — JSON and a Rust subset.** +The core pipeline is implemented and runs through real Git: -- `git-ast setup` registers the filter in a repository. -- On `git add`, the `clean` filter parses Rust with Tree-sitter and stores its +- `git-ast setup` registers the filter in a repository (routes `*.rs` and + `*.json`). +- On `git add`, the `clean` filter parses the source and stores its **canonical** form; on `git checkout`, `smudge` returns it. 
Reformatting therefore never reaches history — two differently-formatted inputs that parse - to the same tree produce byte-identical blobs. Canonicalization is - **deterministic and idempotent** (guarded by property tests; see the - "Determinism contract" in [`src/printer.rs`](./src/printer.rs)), with the - canonical form versioned by the `(grammar, printer)` pair. + to the same structure produce byte-identical blobs. Canonicalization is + **deterministic and idempotent**. Rust is canonicalized via Tree-sitter (the + "Determinism contract" in [`src/printer.rs`](./src/printer.rs), versioned by + the `(grammar, printer)` pair); JSON via `serde_json` + ([`src/json.rs`](./src/json.rs) — sorted keys, pretty-printed). - It speaks Git's real `filter-process` pkt-line protocol, so `git add` / `git checkout` / `git diff` all work end to end. See [`examples/demo.sh`](./examples/demo.sh). @@ -68,11 +70,13 @@ implemented and runs through real Git: Honest boundaries: -- **One language, a subset of it.** The pretty-printer covers the constructs in - the example (functions, params, blocks, `let`, binary/call/macro expressions, - literals, comments). It is **fail-closed**: syntax errors reject the commit, - and any unsupported construct returns an error rather than corrupting code. - Widening coverage is additive — one more arm per node kind. +- **JSON is complete; Rust is a subset.** JSON canonicalization covers any document + `serde_json` accepts (nesting up to 128 levels; duplicate keys keep the last value). The Rust pretty-printer covers the constructs in the + example (functions, params, blocks, `let`, binary/call/macro expressions, + literals, comments). Both are **fail-closed**: syntax errors reject the commit, + and any unsupported Rust construct returns an error rather than corrupting code. + Widening Rust coverage is additive — one more arm per node kind; adding a + language is one more arm in the filter's per-extension dispatch. 
- **Diff and merge drivers are still placeholders.** Making those *structural* depends on the hardest open problem — **stable AST node identity across versions** — which this does **not** solve. Canonical formatting removes diff --git a/src/filters.rs b/src/filters.rs index a280d13..eb86369 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -1,9 +1,10 @@ //! Clean/smudge filter over Git's long-running `filter-process` protocol. //! -//! `clean` (on `git add`) parses Rust source and stores its canonical form, so +//! `clean` (on `git add`) parses the source and stores its canonical form, so //! reformatting never reaches history. `smudge` (on `git checkout`) is identity: -//! the stored bytes are already canonical source. Only `*.rs` paths are -//! transformed; anything else passes through untouched. +//! the stored bytes are already canonical source. `*.rs` and `*.json` paths are +//! transformed (by [`crate::printer`] and [`crate::json`] respectively); anything +//! else passes through untouched. //! //! The conversation is the standard one documented in //! `Documentation/gitattributes.txt`: @@ -14,7 +15,7 @@ //! status line and the transformed content. 
use crate::pktline::{self, Packet}; -use crate::{printer, Error}; +use crate::{json, printer, Error}; use std::collections::HashMap; use std::io::{self, Read, Write}; use std::path::Path; @@ -102,13 +103,18 @@ fn process_one(input: &mut impl Read, output: &mut impl Write) -> Result Result, Error> { - let is_rust = Path::new(pathname).extension().is_some_and(|e| e == "rs"); + let ext = Path::new(pathname).extension().and_then(|e| e.to_str()); match command { - "clean" if is_rust => printer::canonicalize(content), - "smudge" | "clean" => Ok(content.to_vec()), + "clean" => match ext { + Some("rs") => printer::canonicalize(content), + Some("json") => json::canonicalize(content), + _ => Ok(content.to_vec()), + }, + "smudge" => Ok(content.to_vec()), other => Err(Error::Driver(format!("unknown filter command `{other}`"))), } } @@ -187,6 +193,26 @@ mod tests { assert_eq!(response_content(&out), canonical); } + #[test] + fn clean_canonicalizes_json() { + let req = client_stream("clean", "data.json", br#"{ "b":1, "a":2 }"#); + let mut out = Vec::new(); + converse(&mut &req[..], &mut out).unwrap(); + assert_eq!(response_content(&out), b"{\n \"a\": 2,\n \"b\": 1\n}\n"); + } + + #[test] + fn clean_reports_error_on_unparseable_json() { + let req = client_stream("clean", "bad.json", b"{nope}"); + let mut out = Vec::new(); + converse(&mut &req[..], &mut out).unwrap(); + let mut r = &out[..]; + pktline::read_until_flush(&mut r).unwrap(); // handshake + pktline::read_until_flush(&mut r).unwrap(); // capabilities + let status = pktline::read_until_flush(&mut r).unwrap().unwrap(); + assert_eq!(String::from_utf8_lossy(&status).trim_end(), "status=error"); + } + #[test] fn non_rust_passes_through_clean() { let req = client_stream("clean", "notes.txt", b" unchanged "); diff --git a/src/json.rs b/src/json.rs new file mode 100644 index 0000000..8c02017 --- /dev/null +++ b/src/json.rs @@ -0,0 +1,90 @@ +//! JSON canonicalizer. +//! +//! 
The companion to [`crate::printer`] (which canonicalizes Rust): same contract, +//! a different language. [`canonicalize`] parses JSON and re-emits a deterministic +//! canonical form — **object keys sorted, pretty-printed, trailing newline** — so +//! that two differently-formatted-but-equal JSON files store byte-identical blobs. +//! +//! Two facts make this a faithful canonical form: +//! +//! - **Sorted keys.** `serde_json::Map` is a `BTreeMap` (the `preserve_order` +//! feature is off), so keys are emitted in a stable order regardless of input. +//! - **Deterministic scalars.** `serde_json` formats numbers and strings +//! deterministically, so a given value always renders to the same bytes. +//! +//! This follows RFC 8785 (JCS) in spirit — sorted keys, stable scalars — but is +//! not JCS output: it is *pretty* rather than compact (one value per line diffs +//! far better), and key order and number text follow `serde_json`, not ECMAScript. +//! +//! Like the Rust path it is **fail-closed**: unparseable JSON returns an error, +//! so `git add` aborts rather than storing junk. `smudge` is the identity (the +//! stored blob is already canonical source); see [`crate::filters`]. + +use crate::Error; + +/// Parse `source` as JSON and return its canonical form (sorted-key pretty JSON +/// with a trailing newline). Returns [`Error::Parsing`] if `source` is not valid +/// JSON. 
+pub fn canonicalize(source: &[u8]) -> Result<Vec<u8>, Error> { + let value: serde_json::Value = + serde_json::from_slice(source).map_err(|e| Error::Parsing(format!("invalid JSON: {e}")))?; + let mut out = + serde_json::to_vec_pretty(&value).map_err(|e| Error::Serialization(e.to_string()))?; + out.push(b'\n'); + Ok(out) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn canon(s: &str) -> String { + String::from_utf8(canonicalize(s.as_bytes()).unwrap()).unwrap() + } + + #[test] + fn sorts_keys_and_normalizes_whitespace() { + assert_eq!( + canon(r#"{ "b": 1, "a": 2 }"#), + "{\n \"a\": 2,\n \"b\": 1\n}\n" + ); + } + + #[test] + fn is_idempotent() { + let once = canonicalize(br#"{"z":[3,2,1],"a":{"d":4,"c":3}}"#).unwrap(); + let twice = canonicalize(&once).unwrap(); + assert_eq!(once, twice); + } + + #[test] + fn key_order_in_input_does_not_affect_output() { + let a = canon(r#"{"k2":2,"k1":1,"k3":{"n":[1,2,3]}}"#); + let b = canon(r#"{"k3":{"n":[1,2,3]},"k1":1,"k2":2}"#); + assert_eq!(a, b); + } + + #[test] + fn preserves_value_semantics() { + let src = r#"{ "b": 1, "a": [ {"y": true, "x": null} ], "s": "hi\n" }"#; + let before: serde_json::Value = serde_json::from_str(src).unwrap(); + let after: serde_json::Value = + serde_json::from_slice(&canonicalize(src.as_bytes()).unwrap()).unwrap(); + assert_eq!(before, after); + } + + #[test] + fn ends_with_single_newline() { + let out = canonicalize(br#"{"a":1}"#).unwrap(); + assert_eq!(out.last(), Some(&b'\n')); + assert_ne!(out[out.len() - 2], b'\n'); + } + + #[test] + fn rejects_invalid_json() { + assert!(matches!( + canonicalize(b"{not json,}"), + Err(Error::Parsing(_)) + )); + } +} diff --git a/src/lib.rs b/src/lib.rs index ecd4a10..7475219 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,13 +7,14 @@ //! //! ## Status //! -//! **Working clean/smudge round-trip for a Rust subset.** The `clean` filter -//! parses Rust with Tree-sitter and re-emits canonical source ([`printer`]), -//! 
driven over Git's real `filter-process` pkt-line protocol ([`pktline`], -//! [`filters`]) — so `git add`/`git checkout` normalize formatting end to end. -//! The printer covers a documented subset and is fail-closed outside it. The -//! diff and merge drivers ([`drivers`]) remain placeholders: making those -//! structural depends on stable node identity, which is out of scope (see +//! **Working clean/smudge round-trip for two languages.** The `clean` filter +//! canonicalizes Rust (a documented subset, via Tree-sitter — see [`printer`]) +//! and JSON (via `serde_json` — see [`json`]), driven over Git's real +//! `filter-process` pkt-line protocol ([`pktline`], [`filters`]) — so `git add`/ +//! `git checkout` normalize formatting end to end. Both paths are fail-closed: +//! unparseable input rejects the commit rather than storing junk. The diff and +//! merge drivers ([`drivers`]) remain placeholders: making those structural +//! depends on stable node identity, which is out of scope (see //! `docs/planning/scope.md`). //! //! ## Integration points @@ -29,6 +30,7 @@ pub mod config; pub mod drivers; pub mod filters; +pub mod json; pub mod pktline; pub mod printer; pub mod setup; diff --git a/src/main.rs b/src/main.rs index 75bbdc6..8e1cbe5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -71,16 +71,16 @@ fn print_help() { git-ast \n\ \n\ SUBCOMMANDS:\n \ - setup Enable the *.rs clean/smudge filter in this repo\n \ + setup Enable the *.rs and *.json clean/smudge filter here\n \ inspect [FILE] List top-level defs with a formatting-invariant hash\n \ - filter-process Clean/smudge long-running filter (canonicalizes Rust)\n \ + filter-process Clean/smudge long-running filter (Rust + JSON)\n \ diff-driver Git diff driver (placeholder)\n \ merge-driver Git merge driver (placeholder)\n \ --version, -V Print version\n \ --help, -h Print this help\n\ \n\ - The clean/smudge round-trip works for a documented Rust subset and is\n\ - fail-closed outside it. 
Structural diff/merge await stable node identity;\n\ - see docs/ for the design and scope." + The clean/smudge round-trip canonicalizes JSON and a documented Rust\n\ + subset, and is fail-closed outside it. Structural diff/merge await stable\n\ + node identity; see docs/ for the design and scope." ); } diff --git a/src/setup.rs b/src/setup.rs index 2fa8f01..5d387be 100644 --- a/src/setup.rs +++ b/src/setup.rs @@ -1,17 +1,19 @@ //! One-command installation of the git-ast filter into a repository. //! //! `git-ast setup` registers the long-running filter in the current repo's git -//! config and ensures `*.rs` is routed through it in `.gitattributes`, so a user -//! can enable the canonical-formatting round-trip without memorizing the config -//! incantation. It is idempotent: re-running it changes nothing. +//! config and ensures the supported languages (`*.rs`, `*.json`) are routed +//! through it in `.gitattributes`, so a user can enable the canonical-formatting +//! round-trip without memorizing the config incantation. It is idempotent: +//! re-running it changes nothing. use crate::Error; use std::path::Path; use std::process::Command; -const ATTR_LINE: &str = "*.rs filter=git-ast"; +/// `.gitattributes` lines routing the supported languages through the filter. +const ATTR_LINES: &[&str] = &["*.rs filter=git-ast", "*.json filter=git-ast"]; -/// Configure the current repository to use git-ast for `*.rs` files. +/// Configure the current repository to use git-ast for the supported languages. pub fn run() -> Result<(), Error> { // The filter invokes this same binary; use its absolute path so the config // keeps working regardless of the caller's PATH. @@ -24,10 +26,10 @@ pub fn run() -> Result<(), Error> { // silently storing unfiltered bytes. 
git_config("filter.git-ast.required", "true")?; - ensure_attribute()?; + ensure_attributes()?; - eprintln!("git-ast: configured filter for *.rs in this repository."); - eprintln!("git-ast: re-add existing Rust files to canonicalize them: git add --renormalize ."); + eprintln!("git-ast: configured filter for *.rs and *.json in this repository."); + eprintln!("git-ast: re-add existing files to canonicalize them: git add --renormalize ."); Ok(()) } @@ -44,20 +46,24 @@ fn git_config(key: &str, value: &str) -> Result<(), Error> { Ok(()) } -/// Append the `*.rs filter=git-ast` line to `.gitattributes` unless it is -/// already present. -fn ensure_attribute() -> Result<(), Error> { +/// Append each [`ATTR_LINES`] entry to `.gitattributes` unless already present. +fn ensure_attributes() -> Result<(), Error> { let path = Path::new(".gitattributes"); - let existing = std::fs::read_to_string(path).unwrap_or_default(); - if existing.lines().any(|l| l.trim() == ATTR_LINE) { - return Ok(()); - } - let mut updated = existing; - if !updated.is_empty() && !updated.ends_with('\n') { + let mut updated = std::fs::read_to_string(path).unwrap_or_default(); + let mut changed = false; + for line in ATTR_LINES { + if updated.lines().any(|l| l.trim() == *line) { + continue; + } + if !updated.is_empty() && !updated.ends_with('\n') { + updated.push('\n'); + } + updated.push_str(line); updated.push('\n'); + changed = true; + } + if changed { + std::fs::write(path, updated)?; } - updated.push_str(ATTR_LINE); - updated.push('\n'); - std::fs::write(path, updated)?; Ok(()) } diff --git a/tests/claims.rs b/tests/claims.rs index 6c492f7..db4153f 100644 --- a/tests/claims.rs +++ b/tests/claims.rs @@ -67,7 +67,11 @@ async fn install(world: &mut AstWorld) { let process = format!("{} filter-process", env!("CARGO_BIN_EXE_git-ast")); run(&["config", "filter.git-ast.process", &process]); run(&["config", "filter.git-ast.required", "true"]); - std::fs::write(dir.join(".gitattributes"), "*.rs 
filter=git-ast\n").expect("write attrs"); + std::fs::write( + dir.join(".gitattributes"), + "*.rs filter=git-ast\n*.json filter=git-ast\n", + ) + .expect("write attrs"); world.repo = Some(repo); } diff --git a/tests/features/claims.feature b/tests/features/claims.feature index 7709dfd..17fc4ca 100644 --- a/tests/features/claims.feature +++ b/tests/features/claims.feature @@ -67,3 +67,48 @@ Feature: git-ast canonical clean/smudge round-trip Scenario: Non-Rust files pass through unchanged When I stage "notes.txt" containing " spaced text " Then the stored blob for "notes.txt" is " spaced text " + + Scenario: JSON reformatting never reaches history + When I stage "config.json" containing: + """ + { "b": 1, "a": 2 } + """ + And I commit + And I overwrite "config.json" with: + """ + { + "a": 2, + "b": 1 + } + """ + Then "config.json" shows no diff + + Scenario: Different JSON formattings store byte-identical blobs + When I stage "x.json" containing: + """ + {"a":1,"b":2} + """ + And I stage "y.json" containing: + """ + { "b": 2, + "a": 1 } + """ + Then the stored blobs for "x.json" and "y.json" are identical + + Scenario: JSON round-trip restores canonical source on checkout + When I stage "config.json" containing: + """ + { "b": 1, "a": 2 } + """ + And I commit + And I check out "config.json" fresh + Then the working file "config.json" is: + """ + { + "a": 2, + "b": 1 + } + """ + + Scenario: Invalid JSON is rejected (fail-closed) + Then staging "bad.json" containing "{ not valid }" is rejected