From 6098e42141e24e3cf0000df0161e19fae860a8bf Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Wed, 24 Jun 2026 17:50:14 -0600 Subject: [PATCH 01/10] Plan file_tree inspector output --- product/specs/file-tree-inspector-plan.md | 198 ++++++++++++++ product/specs/file-tree-inspector-spec.md | 307 ++++++++++++++++++++++ 2 files changed, 505 insertions(+) create mode 100644 product/specs/file-tree-inspector-plan.md create mode 100644 product/specs/file-tree-inspector-spec.md diff --git a/product/specs/file-tree-inspector-plan.md b/product/specs/file-tree-inspector-plan.md new file mode 100644 index 0000000..d8f65ed --- /dev/null +++ b/product/specs/file-tree-inspector-plan.md @@ -0,0 +1,198 @@ +# Plan - file tree inspector output +> Spec: [File tree inspector output](./file-tree-inspector-spec.md) +> **Status: planning.** + +## Current State + +- `internal/inspect/source.go` walks the raw source tree into `SourceView.files`. + It records relative path, directory, and extension for every non-hidden file. + `file_tree` reads this metadata and opens no files. +- `internal/inspect/inspectors_source.go` implements `FileTree.Inspect` by + grouping refs per directory, turning each directory into feature tokens with + `dirFeatures`, and passing those profiles to `summarize`. +- `internal/inspect/filemeta.go` provides the existing path primitive: + extension histogram, basic casing buckets (`kebab`, `snake`, `other`), + `with_spaces`, and max depth over a set of refs. +- `internal/inspect/summarize.go` clusters profiles into `classes` and + `outliers`. `file_tree_content` and `document_shape` still use this model. +- `internal/inspect/render.go` renders every inspector through the generic + Markdown key/value renderer. It has no `file_tree`-specific projection. +- `internal/inspect/source_test.go` proves `file_tree` applies to filesystem + storage, opens no files, and produces directory profile classes. 
+- `cmd/inspect_test.go` snapshots the source-layer Markdown report and verifies + JSON parseability, output-file parity, truncation, and `-v`. + +## Sequencing + +| Phase | Focus | Scope | +|---|---|---| +| 1 | Failing contracts | source inspector unit tests, CLI snapshots, JSON shape expectations | +| 2 | Filesystem summary model | path-derived summary helpers, thresholds, naming buckets, representative paths | +| 3 | Inspector payload | `FileTree.Inspect` returns structured map evidence, no file reads | +| 4 | Markdown rendering | `file_tree` projection, default caps, verbose expansion | +| 5 | Docs and verification | inspector docs, snapshots, focused test suite | + +The order keeps the TDD contract visible. First pin the desired behavior, then +replace the evidence model, then teach the renderer how to present it. + +## Phases + +### Phase 1 - Failing contracts + +**Goal:** the suite describes the new filesystem-map behavior before production +code changes. + +1. **File:** `internal/inspect/source_test.go`. + Replace the directory-class assertion in + `TestFileTree_opensNothingAndProfilesDirs` with assertions for structured + filesystem facts: total files, directory count, max depth, extension counts, + and top-level regions. Keep the `ParseCount() == 0` assertion. +2. **File:** `internal/inspect/filemeta_test.go`. + Add tests for the richer path classifiers: extension histogram, depth, + directory counts, top-level region selection, naming bucket classification, + and deterministic representative path selection. +3. **File:** `cmd/inspect_test.go`. + Add CLI tests for: + - tiny tree output includes a tree-like listing + - medium tree output includes top-level regions + - default output caps long summaries with a `pass -v` notice + - verbose output expands the capped filesystem evidence +4. **File:** `cmd/testdata/snapshots/inspect/source-report.txt`. + Update the source-layer snapshot after the failing test captures the new + expected shape. 
+ +### Phase 2 - Filesystem summary model + +**Goal:** path-derived facts are computed once, deterministically, without +opening files. + +1. **File:** `internal/inspect/filetree.go` (new). + Add a `fileTreeSummary` builder over `SourceView.files`. Return a + `map[string]any` payload for `Evidence.Data` while keeping typed internal + structs for construction if useful. +2. **File:** `internal/inspect/filetree.go` (new). + Compute whole-tree facts: `file_count`, `dir_count`, `max_depth`, + `extensions`, top-level regions, directory summaries, and tree entries for + small trees. +3. **File:** `internal/inspect/filetree.go` (new). + Add deterministic threshold helpers for small tree, major region, dominant + extension, Markdown-heavy directory, and dominant naming bucket. +4. **File:** `internal/inspect/filetree.go` (new). + Add representative path selection: prefer different top-level regions, sort + lexicographically within each region, cap the returned list, and record hidden + counts. +5. **File:** `internal/inspect/filemeta.go`. + Either expand the existing casing helper or move the naming classifier into + `filetree.go`. Support `kebab-case`, `snake_case`, `camelCase`, `PascalCase`, + `title/spaces`, `lowercase`, `uppercase`, `numeric`, and `mixed/other`. + +### Phase 3 - Inspector payload + +**Goal:** `file_tree` emits filesystem-map evidence instead of profile clusters. + +1. **File:** `internal/inspect/inspectors_source.go`. + Change `FileTree.Inspect` to call the new summary builder and return + `Evidence{Inspector: "file_tree", Scope: v.root, N: v.N(), Data: summary}`. +2. **File:** `internal/inspect/inspectors_source.go`. + Leave `FileTreeContent` and `DocumentShape` on `summarize`. Do not change + their payloads in this issue. +3. **File:** `internal/inspect/inspectors_source.go`. + Remove `dirFeatures` only if no production code or tests still use it after + `file_tree` moves off directory clustering. +4. **File:** `internal/inspect/source_test.go`. 
+ Keep the storage applicability and no-parse assertions green against the new + payload. +5. **File:** `cmd/inspect_test.go`. + Ensure `TestInspect_jsonEmitsSameEvidence` still passes and add direct JSON + assertions for complete `file_tree` fields. + +### Phase 4 - Markdown rendering + +**Goal:** default output reads as a filesystem map, while verbose output shows +the supporting evidence. + +1. **File:** `internal/inspect/render.go`. + Add an inspector-specific branch for `file_tree` before the generic + `dataLines` path. Keep the generic path for all other inspectors. +2. **File:** `internal/inspect/render.go`. + Render default `file_tree` Markdown as: + - overview sentence + - tree listing for small trees + - top-level regions for larger trees + - top file types + - naming summary when a dominant pattern exists + - capped representative paths or exceptions +3. **File:** `internal/inspect/render.go`. + Treat `maxLines <= 0` as expanded output for `file_tree`, matching today's + `-v` behavior. Expanded output shows fuller region, extension, directory, and + naming evidence. +4. **File:** `internal/inspect/render.go`. + Make hidden-data notices explicit and actionable: `pass -v to show all`. +5. **File:** `internal/inspect/render_test.go`. + Add renderer tests for default capping, verbose expansion, and preservation + of generic rendering for other inspectors. + +### Phase 5 - Docs and verification + +**Goal:** docs explain the new boundary and the focused suite verifies behavior. + +1. **File:** `docs/content/deep-dives/inspectors.md`. + Update the raw-source inspector explanation: `file_tree` is the filesystem + map, `file_tree_content` is content facts, and `document_shape` is document + grouping. +2. **File:** `internal/inspect/doc.go`. + Align package-level wording if it implies all source inspectors use profile + clustering. +3. **File:** `cmd/testdata/snapshots/inspect/source-report.txt`. + Regenerate the source report snapshot after the renderer lands. 
+4. **Validation:** run + `go test ./internal/inspect ./cmd`. + If renderer or CLI changes ripple farther, run the broader targeted suite + used for inspect work. + +## Key Files + +| File | Role | +|---|---| +| `internal/inspect/source.go` | owns `SourceView.files`, the no-read path metadata that feeds `file_tree` | +| `internal/inspect/inspectors_source.go` | changes `FileTree.Inspect` from clustering to filesystem summary evidence | +| `internal/inspect/filetree.go` | new filesystem-map summary builder, thresholds, regions, naming buckets, and representatives | +| `internal/inspect/filemeta.go` | existing path metadata helper, either expanded or left as a compatibility primitive | +| `internal/inspect/render.go` | adds the `file_tree` Markdown projection and keeps generic rendering for other inspectors | +| `internal/inspect/source_test.go` | verifies no file reads and structured filesystem facts | +| `internal/inspect/filemeta_test.go` | verifies path-derived helper behavior | +| `internal/inspect/render_test.go` | verifies default and verbose `file_tree` rendering | +| `cmd/inspect_test.go` | verifies CLI output, JSON, snapshots, and verbosity behavior | +| `cmd/testdata/snapshots/inspect/source-report.txt` | pins the source-layer first-run report | +| `docs/content/deep-dives/inspectors.md` | documents the raw-source layering and evidence boundary | +| `internal/inspect/doc.go` | package-level architecture summary if wording needs alignment | + +## Architecture Decisions + +| Decision | Choice | Rationale | +|---|---|---| +| Inspector identity | keep `file_tree` | callers and docs already know this inspector name | +| Evidence model | structured filesystem summary | a map needs counts, regions, examples, and naming evidence, not profile clusters | +| Existing summarizer | keep for `file_tree_content` and `document_shape` | those inspectors still answer similarity questions in this issue's scope | +| Renderer | inspector-specific Markdown branch | generic 
key/value rendering cannot produce a readable tree or capped region summary | +| Verbosity | reuse `-v` / `--max-lines 0` as expanded output | avoids a new CLI flag while matching current inspect behavior | +| Claims | deterministic path-derived observations only | keeps `file_tree` from overlapping parse and document-shape inspectors | +| JSON | complete structured evidence | tools need the full deterministic payload even when Markdown is capped | + +## Documentation updates + +- **Phase 5, File:** `docs/content/deep-dives/inspectors.md`. Describe the + raw-source layering as filesystem map, content facts, document grouping. +- **Phase 5, File:** `internal/inspect/doc.go`. Align package architecture + wording if it says every raw-source inspector emits profile classes. +- **Phase 5, File:** `docs/content/reference/inspectors/`. Regenerate with + `make docs-gen` only if the registry descriptor summary changes. + +## Out of Scope + +- `file_tree_content` output changes. +- `document_shape` output changes. +- Semantic labels such as blog, wiki, docs site, or collection. +- Schema recommendations. +- A new inspect verbosity flag. +- Collection-layer inspector rendering. diff --git a/product/specs/file-tree-inspector-spec.md b/product/specs/file-tree-inspector-spec.md new file mode 100644 index 0000000..971fe8f --- /dev/null +++ b/product/specs/file-tree-inspector-spec.md @@ -0,0 +1,307 @@ +# Spec - file tree inspector output + +> **Status: planning.** Tracks [#110](https://github.com/abegong/katalyst/issues/110). + +## Problem + +`katalyst inspect .` starts with the raw-source inspectors. `file_tree` is the +first layer of that read: it sees paths, names, extensions, depth, and directory +shape before Katalyst opens any file. + +The current `file_tree` output answers a clustering question. It groups +directory profiles into `classes` and `outliers`. 
That hides the filesystem map +the reader needs first: what files are present, where they live, which regions +dominate, and which path-level conventions are visible. + +## Goal + +`file_tree` should render a deterministic filesystem map for humans and agents. +It should help the reader answer these questions at a glance: + +- How many files and directories are present? +- How deep and wide is the tree? +- What is the actual tree for a small directory? +- What are the top-level regions for a larger directory? +- Which extensions dominate? +- Which directories are dense or Markdown-heavy by extension only? +- What filename conventions are visible from paths? +- Which concrete paths support the summary? + +## Boundary + +`file_tree` inspects only filesystem metadata: + +- relative paths +- directory structure +- filenames +- extensions +- path depth +- counts and ratios derived from paths + +It must not report: + +- Markdown parse status +- frontmatter presence or keys +- body sections or headings +- valid content +- candidate collections +- schemas +- framework or project type + +Good output says `content/books/ contains 24 Markdown files, mostly +kebab-case.` It does not say `content/books/ is a book review collection.` + +## Design + +Replace the current `classes` / `outliers` evidence for `file_tree` with a +structured filesystem summary. Keep the inspector name, registry entry, and +"opens no files" contract. + +The summary should include: + +- `file_count` +- `dir_count` +- `max_depth` +- extension histogram +- top-level regions with descendant file counts and dominant extensions +- directory summaries for verbose output and JSON +- naming buckets with counts +- representative paths +- small-tree entries when the tree is compact +- structural exception candidates tied to a stated dominant pattern + +All observations must be backed by deterministic rules and visible counts. + +Suggested thresholds: + +- Small tree: `file_count <= 30` and `dir_count <= 12`. 
+- Major region: a top-level directory in the top 8 by descendant file count, or + any top-level directory with at least 10% of files. +- Dominant extension: one extension has at least 60% of files and at least 3 + files. +- Markdown-heavy directory: a directory has at least 3 `.md` files and `.md` is + at least 60% of files under that directory. +- Dominant naming bucket: one bucket has at least 80% of comparable files and at + least 3 files. +- Naming exception: a file outside a dominant naming bucket, capped in default + output. +- Deep path: path depth greater than the 90th percentile or greater than a fixed + threshold such as 4. Pick the simpler stable rule during implementation. + +These thresholds are conservative. The inspector should say less rather than +overclaim. + +## Naming buckets + +Classify filename stems into deterministic buckets: + +- `kebab-case` +- `snake_case` +- `camelCase` +- `PascalCase` +- `title/spaces` +- `lowercase` +- `uppercase` +- `numeric` +- `mixed/other` + +Only compare regular files with non-empty stems. Directory naming is a separate +classifier if implementation needs it; file naming is the default contract. + +## Verbosity + +Default Markdown output is a skimmable map: + +- overview sentence +- actual tree for small trees +- top-level regions for larger trees +- top 5 extensions +- naming summary only when a dominant pattern or clear exception exists +- capped representative paths or exception examples +- explicit hidden-data notices + +Default output should not include exhaustive directory tables, full histograms, +or long exception lists. + +Verbose Markdown shows more evidence: + +- more regions +- full extension histogram +- directory density table +- naming buckets with counts +- more representative paths +- deepest paths +- more exception examples + +JSON remains complete and parseable. It should not be truncated by Markdown +verbosity. 
+ +It should include the evidence downstream tools need even when Markdown output is +capped: + +- file count +- directory count +- max depth +- extension histogram +- top-level regions and counts +- directory summaries +- naming buckets +- representative paths +- exception candidates +- tree entries or enough path data to reconstruct the tree + +## Default Markdown shape + +For a small tree: + +```markdown +### file_tree (n=7) + +Filesystem map: 7 files in 4 directories, max depth 2. Most files are Markdown. + +Tree: +. +├── README.md +├── books/ +│ ├── dune.md +│ └── it.md +└── notes/ + └── meeting-2026-06-24.md + +File types: +- .md: 5 +- .png: 1 +- no extension: 1 + +Naming: +- Markdown filenames are mostly kebab-case: 4 of 5 files. +``` + +For a larger tree: + +```markdown +### file_tree (n=184) + +Filesystem map: 184 files in 26 directories, max depth 5. Markdown is the dominant extension: 128 of 184 files. + +Top-level regions: +- docs/ - 54 files, mostly .md +- content/ - 86 files, mostly .md +- static/ - 31 files, mostly .png, .css +- scripts/ - 6 files, mostly .sh +- ... 4 more top-level entries hidden; pass -v to show all + +File types: +- .md: 128 +- .png: 18 +- .yml: 9 +- .css: 6 +- .sh: 5 +- ... 7 more extensions hidden; pass -v to show all + +Naming: +- Markdown filenames are mostly kebab-case: 113 of 128 files. +- 6 Markdown files contain spaces, for example `docs/Old Notes.md`. +``` + +Use ASCII-only tree rendering unless the project accepts Unicode tree characters +in CLI snapshots. If Unicode tree characters ship, keep them stable and covered +by snapshots. + +## Exceptions + +Exceptions appear only when there is a stated pattern to be an exception to. + +Good: + +```markdown +Naming: +- Markdown filenames are mostly kebab-case: 113 of 128 files. +- Exceptions include `docs/Old Notes.md` and `content/books/Dune.md`. 
+``` + +Avoid: + +```markdown +Outliers: +- docs/Old Notes.md +- content/books/Dune.md +``` + +Exception lists are capped in default output and expanded in verbose output. + +## Representative paths + +Representative path selection is deterministic: + +1. Prefer paths from different top-level regions. +2. Within each region, sort lexicographically. +3. Cap the total number shown. +4. State when additional paths are hidden. + +Representative paths ground summary claims without becoming a full file listing. + +## Relationship to other raw-source inspectors + +`file_tree` reports the map: paths, names, extensions, counts, depth, and +density. + +`file_tree_content` reports content facts: parse status, frontmatter presence, +and directory-level content shape. + +`document_shape` reports document grouping: candidate document groups, shared +fingerprints, and document-level exceptions. + +When a claim requires opening a file, leave it to `file_tree_content` or +`document_shape`. + +## Tests + +Cover the behavior with failing tests first: + +- tiny tree renders a tree-like listing +- medium tree renders top-level regions +- dominant extension and dominant naming pattern are reported with counts +- naming exceptions are grounded in paths +- default output is capped and verbose output expands it +- JSON contains complete structured evidence +- `file_tree` still opens no files + +## Acceptance criteria + +- `katalyst inspect . --inspector file_tree` reads as a coherent filesystem map, + not a dump of generic summarizer internals. +- Small directories render an actual tree or tree-like listing. +- Larger directories render top-level regions and capped summaries. +- Interpretive phrases are backed by deterministic thresholds and visible + counts. +- The inspector does not claim parse status, frontmatter, body structure, + schemas, or collections. +- Default Markdown is concise; verbose Markdown shows more evidence; JSON remains + complete. 
+- Truncation is explicit and tells the user how to see more. +- Snapshot tests cover a tiny tree, a medium tree with multiple top-level + regions, dominant extension and naming patterns, naming exceptions, and a tree + large enough to trigger truncation. +- Unit tests cover extension histograms, directory counts, depth calculation, + naming bucket classification, and deterministic representative path selection. + +## Documentation updates + +Update the inspector deep-dive after the implementation lands: + +- `docs/content/deep-dives/inspectors.md`: describe the raw-source layering as + filesystem map, content facts, document grouping. +- `internal/inspect/doc.go`: keep the package summary aligned with the new + `file_tree` evidence model if it names the old clustering shape. +- Generated inspector reference only changes if registry descriptor wording + changes. + +## Out of Scope + +- Changing `file_tree_content`. +- Changing `document_shape`. +- Adding semantic project or framework labels. +- Adding schema recommendations. +- Adding a new CLI verbosity flag. +- Changing collection-layer inspectors. 
From f8a07d3db9f015fb7c8f62b12ba7e7a066d37d74 Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Wed, 24 Jun 2026 19:49:15 -0600 Subject: [PATCH 02/10] Build file tree inspector map --- .../snapshots/inspect/source-report.txt | 26 +- cmd/testdata/snapshots/inspectors/list.txt | 2 +- docs/content/deep-dives/inspectors.md | 28 +- docs/content/reference/inspectors/_index.md | 2 +- .../reference/inspectors/source/_index.md | 2 +- .../reference/inspectors/source/file-tree.md | 2 +- internal/inspect/filetree.go | 484 ++++++++++++++++++ internal/inspect/filetree_test.go | 74 +++ internal/inspect/inspectors_source.go | 13 +- internal/inspect/registry.go | 2 +- internal/inspect/render.go | 349 +++++++++++++ internal/inspect/render_test.go | 72 +++ internal/inspect/source_test.go | 35 +- 13 files changed, 1049 insertions(+), 42 deletions(-) create mode 100644 internal/inspect/filetree.go create mode 100644 internal/inspect/filetree_test.go diff --git a/cmd/testdata/snapshots/inspect/source-report.txt b/cmd/testdata/snapshots/inspect/source-report.txt index 1acdfd4..c446b59 100644 --- a/cmd/testdata/snapshots/inspect/source-report.txt +++ b/cmd/testdata/snapshots/inspect/source-report.txt @@ -22,8 +22,24 @@ _Cluster files into candidate collections by a composite fingerprint of frontmat ### file_tree (n=2) -_Profile each directory's file types, naming, and depth, opening no files._ - -- classes: -- outliers: - - features=[ext:.md, casing:kebab] label=books +_Map files, directories, extensions, regions, and filename conventions, opening no files._ + +summary: + files : 2 + directories : 2 + max depth : 2 + dominant type: - + +tree: +. 
++-- books/ + +-- dune.md + +-- it.md + +file types: + TYPE FILES + .md 2 + +representative paths: + books/dune.md + books/it.md diff --git a/cmd/testdata/snapshots/inspectors/list.txt b/cmd/testdata/snapshots/inspectors/list.txt index 1cf4e28..5050158 100644 --- a/cmd/testdata/snapshots/inspectors/list.txt +++ b/cmd/testdata/snapshots/inspectors/list.txt @@ -1,7 +1,7 @@ Raw-source inspectors (3) ------------------------- - file_tree - Profile each directory's file types, naming, and depth, opening no files. + Map files, directories, extensions, regions, and filename conventions, opening no files. - file_tree_content Parse markdown and profile each directory's content shape: parse rate, frontmatter, key-sets. - document_shape diff --git a/docs/content/deep-dives/inspectors.md b/docs/content/deep-dives/inspectors.md index fd0b68b..55fcbaa 100644 --- a/docs/content/deep-dives/inspectors.md +++ b/docs/content/deep-dives/inspectors.md @@ -44,8 +44,9 @@ inspectors themselves are thin wrappers that point a primitive at an input: are typed but not yet characterized. - **`markdownBody`** - heading-shape and recurring-section facets over a set of bodies. -- **`fileMetadata`** - path-level conventions (type, naming, depth) over a set - of references, opening no files. +- **`fileMetadata` and `fileTree`** - path-level conventions and filesystem + shape (types, naming, depth, regions, directory density) over references, + opening no files. The same `objectFields` primitive runs over a collection's items (collection layer) and over loose-file frontmatter (the `document_shape` fingerprint, raw @@ -67,22 +68,25 @@ why a conclusion holds and decides. ## The determinism dividing line Deterministic measurement is an inspector's job; threshold-picking and -structure-proposing are not. Counting field presence, histogramming types, and -clustering files by a composite fingerprint are all deterministic, all -inspectors. 
Deciding that 94% is "required", that two near-but-distinct clusters -are one collection, or what to name a schema are all judgment, none of it here. +structure-proposing are not. Counting field presence, histogramming types, +mapping filesystem regions, and clustering files by a composite fingerprint are +all deterministic, all inspectors. Deciding that 94% is "required", that two +near-but-distinct clusters are one collection, or what to name a schema are all +judgment, none of it here. `document_shape` sits on the seam: it groups files with matching fingerprints (deterministic) but leaves the fuzzy "these two classes are the same collection" call to the reader. ## Keeping output small -The summarizing inspectors (`file_tree`, `document_shape`) collapse -near-identical profiles into named classes, so output is proportional to the -number of *distinct* profiles, not the number of directories or files; the rest -are reported as outliers. The collapse tolerance is the first inspector -parameter, in three mutually-exclusive forms: a named detail level, a similarity -proportion, or a max-classes budget. +`file_tree` keeps Markdown output small with deterministic caps: small trees get +an actual tree, while larger trees show top-level regions, dominant extensions, +naming patterns, and representative paths; pass `-v` for expanded evidence. +Clustering inspectors such as `document_shape` still collapse near-identical +profiles into named classes, so output is proportional to the number of +*distinct* profiles rather than the number of files. The collapse tolerance is +the first inspector parameter, in three mutually-exclusive forms: a named detail +level, a similarity proportion, or a max-classes budget.
## Output diff --git a/docs/content/reference/inspectors/_index.md b/docs/content/reference/inspectors/_index.md index befc612..cad8aaa 100644 --- a/docs/content/reference/inspectors/_index.md +++ b/docs/content/reference/inspectors/_index.md @@ -14,7 +14,7 @@ Inspectors describe the shape of content and return evidence: counts and distrib Raw-source inspectors profile a backend store directly, before any collection configuration: what files are present, how they parse, and how they are named. -- [File tree]({{< relref "source/file-tree.md" >}}): Profile each directory's file types, naming, and depth, opening no files. +- [File tree]({{< relref "source/file-tree.md" >}}): Map files, directories, extensions, regions, and filename conventions, opening no files. - [File tree (deep)]({{< relref "source/file-tree-content.md" >}}): Parse markdown and profile each directory's content shape: parse rate, frontmatter, key-sets. - [Document shape]({{< relref "source/document-shape.md" >}}): Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming. diff --git a/docs/content/reference/inspectors/source/_index.md b/docs/content/reference/inspectors/source/_index.md index d7727e1..0846a40 100644 --- a/docs/content/reference/inspectors/source/_index.md +++ b/docs/content/reference/inspectors/source/_index.md @@ -10,6 +10,6 @@ Raw-source inspectors profile a backend store directly, before any collection co Inspectors in this layer: -- [File tree]({{< relref "file-tree.md" >}}): Profile each directory's file types, naming, and depth, opening no files. +- [File tree]({{< relref "file-tree.md" >}}): Map files, directories, extensions, regions, and filename conventions, opening no files. - [File tree (deep)]({{< relref "file-tree-content.md" >}}): Parse markdown and profile each directory's content shape: parse rate, frontmatter, key-sets. 
- [Document shape]({{< relref "document-shape.md" >}}): Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming. diff --git a/docs/content/reference/inspectors/source/file-tree.md b/docs/content/reference/inspectors/source/file-tree.md index b0747c3..c2c0b0a 100644 --- a/docs/content/reference/inspectors/source/file-tree.md +++ b/docs/content/reference/inspectors/source/file-tree.md @@ -15,7 +15,7 @@ source ## Purpose -Profile each directory's file types, naming, and depth, opening no files. +Map files, directories, extensions, regions, and filename conventions, opening no files. ## Usage diff --git a/internal/inspect/filetree.go b/internal/inspect/filetree.go new file mode 100644 index 0000000..fe1d26f --- /dev/null +++ b/internal/inspect/filetree.go @@ -0,0 +1,484 @@ +package inspect + +import ( + "fmt" + "path" + "regexp" + "sort" + "strings" + "unicode" +) + +const ( + smallTreeFileLimit = 30 + smallTreeDirLimit = 12 +) + +var ( + camelPattern = regexp.MustCompile(`^[a-z][A-Za-z0-9]*[A-Z][A-Za-z0-9]*$`) + pascalPattern = regexp.MustCompile(`^[A-Z][A-Za-z0-9]*$`) +) + +type fileTreeSummary struct { + fileCount int + dirCount int + maxDepth int + extensions map[string]int + regions []fileTreeRegion + directories []fileTreeDirectory + naming fileTreeNaming + representativePaths []string + deepPaths []string + treeLines []string + paths []string +} + +type fileTreeRegion struct { + path string + fileCount int + extensions map[string]int + dominantExt string +} + +type fileTreeDirectory struct { + path string + depth int + directFileCount int + descendantFileCount int + extensions map[string]int + markdownHeavy bool +} + +type fileTreeNaming struct { + buckets map[string]int + dominantBucket string + dominantCount int + comparableCount int + exceptions []fileTreeNamingException + byExtension map[string]fileTreeNaming + dominantExtScope string +} + +type fileTreeNamingException struct { + path string + 
bucket string + ext string +} + +// buildFileTreeSummary computes deterministic filesystem metadata for the +// file_tree inspector. It only uses SourceView's walked path metadata. +func buildFileTreeSummary(v SourceView) map[string]any { + files := append([]sourceFile(nil), v.files...) + sort.Slice(files, func(i, j int) bool { return files[i].rel < files[j].rel }) + + s := fileTreeSummary{ + fileCount: len(files), + extensions: map[string]int{}, + paths: make([]string, 0, len(files)), + } + dirs := map[string]bool{".": true} + regionExts := map[string]map[string]int{} + regionCounts := map[string]int{} + dirDirect := map[string]int{} + dirDesc := map[string]int{} + dirExts := map[string]map[string]int{} + deepThreshold := 4 + + for _, f := range files { + s.paths = append(s.paths, f.rel) + s.extensions[f.ext]++ + depth := pathDepth(f.rel) + if depth > s.maxDepth { + s.maxDepth = depth + } + if depth > deepThreshold { + s.deepPaths = append(s.deepPaths, f.rel) + } + + for _, dir := range ancestorDirs(f.rel) { + dirs[dir] = true + dirDesc[dir]++ + if dirExts[dir] == nil { + dirExts[dir] = map[string]int{} + } + dirExts[dir][f.ext]++ + } + dirDirect[f.dir]++ + + region := topLevelRegion(f.rel) + regionCounts[region]++ + if regionExts[region] == nil { + regionExts[region] = map[string]int{} + } + regionExts[region][f.ext]++ + } + + s.dirCount = len(dirs) + for _, region := range sortedKeys(regionCounts) { + exts := regionExts[region] + s.regions = append(s.regions, fileTreeRegion{ + path: region, + fileCount: regionCounts[region], + extensions: exts, + dominantExt: dominantExtension(exts, regionCounts[region]), + }) + } + sort.SliceStable(s.regions, func(i, j int) bool { + if s.regions[i].fileCount != s.regions[j].fileCount { + return s.regions[i].fileCount > s.regions[j].fileCount + } + return s.regions[i].path < s.regions[j].path + }) + + for _, dir := range sortedKeys(dirs) { + exts := dirExts[dir] + if exts == nil { + exts = map[string]int{} + } + md := 
exts[".md"] + desc := dirDesc[dir] + s.directories = append(s.directories, fileTreeDirectory{ + path: dir, + depth: dirDepth(dir), + directFileCount: dirDirect[dir], + descendantFileCount: desc, + extensions: exts, + markdownHeavy: md >= 3 && desc > 0 && md*100 >= desc*60, + }) + } + sort.SliceStable(s.directories, func(i, j int) bool { + if s.directories[i].descendantFileCount != s.directories[j].descendantFileCount { + return s.directories[i].descendantFileCount > s.directories[j].descendantFileCount + } + return s.directories[i].path < s.directories[j].path + }) + + s.naming = summarizeNaming(files, "") + s.naming.byExtension = map[string]fileTreeNaming{} + for _, ext := range sortedKeys(s.extensions) { + extNaming := summarizeNaming(files, ext) + if extNaming.comparableCount > 0 { + s.naming.byExtension[ext] = extNaming + } + } + s.naming.dominantExtScope = dominantNamingScope(s.naming.byExtension) + s.representativePaths = representativePaths(files, 10) + if len(files) <= smallTreeFileLimit && s.dirCount <= smallTreeDirLimit { + s.treeLines = asciiTree(files) + } + return s.toMap() +} + +func (s fileTreeSummary) toMap() map[string]any { + return map[string]any{ + "file_count": s.fileCount, + "dir_count": s.dirCount, + "max_depth": s.maxDepth, + "extensions": toAnyMap(s.extensions), + "top_level_regions": regionsToAny(s.regions), + "directory_summaries": directoriesToAny(s.directories), + "naming": namingToAny(s.naming), + "representative_paths": stringsToAny(s.representativePaths), + "deep_paths": stringsToAny(s.deepPaths), + "tree_entries": stringsToAny(s.treeLines), + "paths": stringsToAny(s.paths), + } +} + +func regionsToAny(regions []fileTreeRegion) []any { + out := make([]any, 0, len(regions)) + for _, r := range regions { + out = append(out, map[string]any{ + "path": r.path, + "file_count": r.fileCount, + "extensions": toAnyMap(r.extensions), + "dominant_extension": r.dominantExt, + }) + } + return out +} + +func directoriesToAny(dirs 
[]fileTreeDirectory) []any { + out := make([]any, 0, len(dirs)) + for _, d := range dirs { + out = append(out, map[string]any{ + "path": d.path, + "depth": d.depth, + "direct_file_count": d.directFileCount, + "descendant_file_count": d.descendantFileCount, + "extensions": toAnyMap(d.extensions), + "markdown_heavy": d.markdownHeavy, + }) + } + return out +} + +func namingToAny(n fileTreeNaming) map[string]any { + out := map[string]any{ + "buckets": toAnyMap(n.buckets), + "dominant_bucket": n.dominantBucket, + "dominant_count": n.dominantCount, + "comparable_count": n.comparableCount, + "exceptions": namingExceptionsToAny(n.exceptions), + } + if n.dominantExtScope != "" { + out["dominant_extension_scope"] = n.dominantExtScope + } + if len(n.byExtension) > 0 { + byExt := map[string]any{} + for _, ext := range sortedKeys(n.byExtension) { + child := n.byExtension[ext] + child.byExtension = nil + byExt[ext] = namingToAny(child) + } + out["by_extension"] = byExt + } + return out +} + +func namingExceptionsToAny(exceptions []fileTreeNamingException) []any { + out := make([]any, 0, len(exceptions)) + for _, e := range exceptions { + out = append(out, map[string]any{"path": e.path, "bucket": e.bucket, "extension": e.ext}) + } + return out +} + +func stringsToAny(in []string) []any { + out := make([]any, len(in)) + for i, v := range in { + out[i] = v + } + return out +} + +func pathDepth(rel string) int { + if rel == "" || rel == "." { + return 0 + } + return strings.Count(rel, "/") + 1 +} + +func dirDepth(dir string) int { + if dir == "." || dir == "" { + return 0 + } + return strings.Count(dir, "/") + 1 +} + +func ancestorDirs(rel string) []string { + dir := path.Dir(rel) + out := []string{"."} + if dir == "." { + return out + } + parts := strings.Split(dir, "/") + for i := range parts { + out = append(out, strings.Join(parts[:i+1], "/")) + } + return out +} + +func topLevelRegion(rel string) string { + if !strings.Contains(rel, "/") { + return "." 
+ } + return strings.Split(rel, "/")[0] + "/" +} + +func dominantExtension(exts map[string]int, total int) string { + ext, n := dominantInt(exts) + if n >= 3 && total > 0 && n*100 >= total*60 { + return ext + } + return "" +} + +func dominantInt(hist map[string]int) (string, int) { + best, bestN := "", -1 + for _, k := range sortedKeys(hist) { + if hist[k] > bestN { + best, bestN = k, hist[k] + } + } + return best, bestN +} + +func summarizeNaming(files []sourceFile, extFilter string) fileTreeNaming { + n := fileTreeNaming{buckets: map[string]int{}} + var comparable []struct { + file sourceFile + bucket string + } + for _, f := range files { + if extFilter != "" && f.ext != extFilter { + continue + } + stem := strings.TrimSuffix(path.Base(f.rel), path.Ext(f.rel)) + if stem == "" { + continue + } + bucket := namingBucket(stem) + n.buckets[bucket]++ + n.comparableCount++ + comparable = append(comparable, struct { + file sourceFile + bucket string + }{file: f, bucket: bucket}) + } + n.dominantBucket, n.dominantCount = dominantInt(n.buckets) + if n.dominantCount >= 3 && n.comparableCount > 0 && n.dominantCount*100 >= n.comparableCount*80 { + for _, item := range comparable { + if item.bucket != n.dominantBucket { + n.exceptions = append(n.exceptions, fileTreeNamingException{ + path: item.file.rel, + bucket: item.bucket, + ext: item.file.ext, + }) + } + } + } else { + n.dominantBucket = "" + n.dominantCount = 0 + } + return n +} + +func dominantNamingScope(byExt map[string]fileTreeNaming) string { + bestExt, bestN := "", 0 + for _, ext := range sortedKeys(byExt) { + n := byExt[ext] + if n.dominantBucket != "" && n.comparableCount > bestN { + bestExt, bestN = ext, n.comparableCount + } + } + return bestExt +} + +func namingBucket(stem string) string { + switch { + case strings.Contains(stem, " "): + return "title/spaces" + case isAllDigits(stem): + return "numeric" + case isAllLetters(stem) && stem == strings.ToLower(stem): + return "lowercase" + case isAllLetters(stem) 
&& stem == strings.ToUpper(stem): + return "uppercase" + case strings.Contains(stem, "-") && kebabPattern.MatchString(stem): + return "kebab-case" + case strings.Contains(stem, "_") && snakePattern.MatchString(stem): + return "snake_case" + case camelPattern.MatchString(stem): + return "camelCase" + case pascalPattern.MatchString(stem): + return "PascalCase" + default: + return "mixed/other" + } +} + +func isAllDigits(s string) bool { + for _, r := range s { + if !unicode.IsDigit(r) { + return false + } + } + return s != "" +} + +func isAllLetters(s string) bool { + for _, r := range s { + if !unicode.IsLetter(r) { + return false + } + } + return s != "" +} + +func representativePaths(files []sourceFile, cap int) []string { + byRegion := map[string][]string{} + for _, f := range files { + byRegion[topLevelRegion(f.rel)] = append(byRegion[topLevelRegion(f.rel)], f.rel) + } + for _, paths := range byRegion { + sort.Strings(paths) + } + var out []string + regions := sortedKeys(byRegion) + for len(out) < cap { + added := false + for _, region := range regions { + paths := byRegion[region] + if len(paths) == 0 { + continue + } + out = append(out, paths[0]) + byRegion[region] = paths[1:] + added = true + if len(out) == cap { + break + } + } + if !added { + break + } + } + return out +} + +func asciiTree(files []sourceFile) []string { + type node struct { + name string + file bool + children map[string]*node + } + root := &node{name: ".", children: map[string]*node{}} + for _, f := range files { + cur := root + parts := strings.Split(f.rel, "/") + for i, part := range parts { + child := cur.children[part] + if child == nil { + child = &node{name: part, children: map[string]*node{}} + cur.children[part] = child + } + if i == len(parts)-1 { + child.file = true + } + cur = child + } + } + lines := []string{"."} + var walk func(*node, string) + walk = func(n *node, prefix string) { + names := sortedKeys(n.children) + sort.SliceStable(names, func(i, j int) bool { + a, b := 
n.children[names[i]], n.children[names[j]] + if a.file != b.file { + return !a.file + } + return names[i] < names[j] + }) + for i, name := range names { + child := n.children[name] + last := i == len(names)-1 + connector := "+-- " + nextPrefix := prefix + "| " + if last { + connector = "+-- " + nextPrefix = prefix + " " + } + label := child.name + if !child.file { + label += "/" + } + lines = append(lines, fmt.Sprintf("%s%s%s", prefix, connector, label)) + if !child.file { + walk(child, nextPrefix) + } + } + } + walk(root, "") + return lines +} diff --git a/internal/inspect/filetree_test.go b/internal/inspect/filetree_test.go new file mode 100644 index 0000000..a76789e --- /dev/null +++ b/internal/inspect/filetree_test.go @@ -0,0 +1,74 @@ +package inspect + +import "testing" + +func TestNamingBucket(t *testing.T) { + tests := map[string]string{ + "old-notes": "kebab-case", + "old_notes": "snake_case", + "oldNotes": "camelCase", + "OldNotes": "PascalCase", + "Old Notes": "title/spaces", + "lower": "lowercase", + "UPPER": "uppercase", + "123": "numeric", + "old.Notes": "mixed/other", + } + for stem, want := range tests { + if got := namingBucket(stem); got != want { + t.Errorf("namingBucket(%q) = %q, want %q", stem, got, want) + } + } +} + +func TestBuildFileTreeSummary_regionsNamingAndRepresentatives(t *testing.T) { + view := SourceView{files: []sourceFile{ + {rel: "README", dir: ".", ext: ""}, + {rel: "books/dune-book.md", dir: "books", ext: ".md"}, + {rel: "books/it-review.md", dir: "books", ext: ".md"}, + {rel: "books/messiah-notes.md", dir: "books", ext: ".md"}, + {rel: "books/Old Notes.md", dir: "books", ext: ".md"}, + {rel: "notes/reading-list.md", dir: "notes", ext: ".md"}, + {rel: "static/logo.png", dir: "static", ext: ".png"}, + {rel: "static/site.css", dir: "static", ext: ".css"}, + }} + + data := buildFileTreeSummary(view) + if data["file_count"].(int) != 8 { + t.Fatalf("file_count = %v, want 8", data["file_count"]) + } + if data["dir_count"].(int) != 4 { 
+ t.Errorf("dir_count = %v, want 4", data["dir_count"]) + } + if data["max_depth"].(int) != 2 { + t.Errorf("max_depth = %v, want 2", data["max_depth"]) + } + + regions := data["top_level_regions"].([]any) + first := regions[0].(map[string]any) + if first["path"] != "books/" || first["file_count"].(int) != 4 { + t.Errorf("top region = %v, want books/ with 4 files", first) + } + + naming := data["naming"].(map[string]any) + if naming["dominant_extension_scope"] != ".md" { + t.Fatalf("dominant_extension_scope = %v, want .md", naming["dominant_extension_scope"]) + } + byExt := naming["by_extension"].(map[string]any) + mdNaming := byExt[".md"].(map[string]any) + if mdNaming["dominant_bucket"] != "kebab-case" || mdNaming["dominant_count"].(int) != 4 { + t.Errorf("markdown naming = %v, want kebab-case count 4", mdNaming) + } + exceptions := mdNaming["exceptions"].([]any) + if len(exceptions) != 1 || exceptions[0].(map[string]any)["path"] != "books/Old Notes.md" { + t.Errorf("exceptions = %v, want Old Notes.md", exceptions) + } + + reps := data["representative_paths"].([]any) + want := []string{"README", "books/Old Notes.md", "notes/reading-list.md", "static/logo.png"} + for i, rel := range want { + if reps[i] != rel { + t.Errorf("representative_paths[%d] = %v, want %s", i, reps[i], rel) + } + } +} diff --git a/internal/inspect/inspectors_source.go b/internal/inspect/inspectors_source.go index 857de8a..c92fb17 100644 --- a/internal/inspect/inspectors_source.go +++ b/internal/inspect/inspectors_source.go @@ -7,9 +7,9 @@ import ( "github.com/abegong/katalyst/internal/storage" ) -// FileTree is the shallow, cheap raw-source inspector: a per-directory -// path-level profile (file types, naming, depth) summarized into classes. It -// opens no files. Filesystem-specific. Subsumes the former filesystem_naming. +// FileTree is the shallow, cheap raw-source inspector: a deterministic +// filesystem map from path metadata. It opens no files. Filesystem-specific. 
+// Subsumes the former filesystem_naming. type FileTree struct{} func (FileTree) Name() string { return "file_tree" } @@ -17,12 +17,7 @@ func (FileTree) Name() string { return "file_tree" } func (FileTree) AppliesTo(t storage.StorageType) bool { return t == storage.Filesystem } func (FileTree) Inspect(v SourceView, p Params) Evidence { - byDir := v.refsByDir() - profiles := make([]Profile, 0, len(byDir)) - for _, dir := range sortedKeys(byDir) { - profiles = append(profiles, Profile{Label: dir, Features: dirFeatures(byDir[dir])}) - } - return Evidence{Inspector: "file_tree", Scope: v.root, N: v.N(), Data: summarize(profiles, p)} + return Evidence{Inspector: "file_tree", Scope: v.root, N: v.N(), Data: buildFileTreeSummary(v)} } // FileTreeContent is the deep raw-source inspector: it parses markdown and diff --git a/internal/inspect/registry.go b/internal/inspect/registry.go index d31dbf4..a4366aa 100644 --- a/internal/inspect/registry.go +++ b/internal/inspect/registry.go @@ -78,7 +78,7 @@ func Descriptors() []Descriptor { Family: "filesystem", Slug: "file-tree", Title: "File tree", - Summary: "Profile each directory's file types, naming, and depth, opening no files.", + Summary: "Map files, directories, extensions, regions, and filename conventions, opening no files.", }, { Name: "file_tree_content", diff --git a/internal/inspect/render.go b/internal/inspect/render.go index 17e943b..9845bfa 100644 --- a/internal/inspect/render.go +++ b/internal/inspect/render.go @@ -3,6 +3,7 @@ package inspect import ( "encoding/json" "fmt" + "sort" "strings" ) @@ -55,6 +56,13 @@ func RenderMarkdown(evs []Evidence, maxLines int) string { if s := Summary(ev.Inspector); s != "" { fmt.Fprintf(&b, "_%s_\n\n", s) } + if ev.Inspector == "file_tree" { + for _, ln := range fileTreeMarkdownLines(ev.Data, maxLines <= 0) { + b.WriteString(ln) + b.WriteByte('\n') + } + continue + } lines := dataLines(ev.Data) if maxLines > 0 && len(lines) > maxLines { hidden := len(lines) - maxLines @@ -75,6 
+83,347 @@ func RenderMarkdown(evs []Evidence, maxLines int) string { return b.String() } +func fileTreeMarkdownLines(data map[string]any, expanded bool) []string { + fileCount := asInt(data["file_count"]) + dirCount := asInt(data["dir_count"]) + maxDepth := asInt(data["max_depth"]) + extensions := anyMap(data["extensions"]) + lines := []string{"summary:"} + lines = append(lines, alignRows([][]string{ + {"files", fmt.Sprintf("%d", fileCount)}, + {"directories", fmt.Sprintf("%d", dirCount)}, + {"max depth", fmt.Sprintf("%d", maxDepth)}, + {"dominant type", dominantExtensionSummary(fileCount, extensions)}, + }, " ", ": ")...) + + tree := stringSlice(data["tree_entries"]) + if len(tree) > 0 { + lines = append(lines, "", "tree:") + lines = append(lines, tree...) + lines = append(lines, "") + } else { + regions := anySlice(data["top_level_regions"]) + limit := 5 + if expanded { + limit = len(regions) + } + lines = append(lines, "", "top-level regions:") + rows := [][]string{{"REGION", "FILES", "TYPES"}} + for i, region := range regions { + if i >= limit { + break + } + m := region.(map[string]any) + path := m["path"].(string) + count := asInt(m["file_count"]) + exts := anyMap(m["extensions"]) + rows = append(rows, []string{path, fmt.Sprintf("%d", count), topExtensions(exts, 2)}) + } + lines = append(lines, alignTable(rows, " ")...) + if len(regions) > limit { + lines = append(lines, fmt.Sprintf(" ... %d more top-level entries hidden; pass -v to show all", len(regions)-limit)) + } + lines = append(lines, "") + } + + extLimit := 5 + if expanded { + extLimit = len(extensions) + } + lines = append(lines, "file types:") + lines = append(lines, histogramTableLines(extensions, extLimit, "TYPE", "FILES")...) + if len(extensions) > extLimit { + lines = append(lines, fmt.Sprintf(" ... 
%d more extensions hidden; pass -v to show all", len(extensions)-extLimit)) + } + + if naming := namingLines(anyMap(data["naming"]), expanded); len(naming) > 0 { + lines = append(lines, "") + lines = append(lines, "naming:") + lines = append(lines, naming...) + } + + if expanded { + lines = append(lines, "") + lines = append(lines, "directory density:") + rows := [][]string{{"DIRECTORY", "FILES", "DIRECT", "NOTES"}} + for _, item := range anySlice(data["directory_summaries"]) { + dir := item.(map[string]any) + label := dir["path"].(string) + count := asInt(dir["descendant_file_count"]) + direct := asInt(dir["direct_file_count"]) + notes := "-" + if heavy, _ := dir["markdown_heavy"].(bool); heavy { + notes = "Markdown-heavy" + } + rows = append(rows, []string{label, fmt.Sprintf("%d", count), fmt.Sprintf("%d", direct), notes}) + } + lines = append(lines, alignTable(rows, " ")...) + if deep := stringSlice(data["deep_paths"]); len(deep) > 0 { + lines = append(lines, "") + lines = append(lines, "deep paths:") + for _, rel := range deep { + lines = append(lines, fmt.Sprintf(" %s", rel)) + } + } + } + + paths := stringSlice(data["representative_paths"]) + if len(paths) > 0 { + limit := 5 + if expanded { + limit = len(paths) + } + lines = append(lines, "") + lines = append(lines, "representative paths:") + for i, rel := range paths { + if i >= limit { + break + } + lines = append(lines, fmt.Sprintf(" %s", rel)) + } + if len(paths) > limit { + lines = append(lines, fmt.Sprintf(" ... 
%d more representative paths hidden; pass -v to show all", len(paths)-limit)) + } + } + return lines +} + +func dominantExtensionSummary(fileCount int, exts map[string]any) string { + if ext, n := dominantAnyExtension(exts); ext != "" && fileCount > 0 && n >= 3 && n*100 >= fileCount*60 { + return fmt.Sprintf("%s (%d of %d files)", extensionLabel(ext), n, fileCount) + } + return "-" +} + +func histogramTableLines(hist map[string]any, limit int, keyHeader, countHeader string) []string { + items := sortedHistogram(hist) + rows := [][]string{{keyHeader, countHeader}} + for i, item := range items { + if i >= limit { + break + } + rows = append(rows, []string{extensionLabel(item.key), fmt.Sprintf("%d", item.n)}) + } + return alignTable(rows, " ") +} + +func topExtensions(exts map[string]any, limit int) string { + var parts []string + for i, item := range sortedHistogram(exts) { + if i >= limit { + break + } + parts = append(parts, extensionLabel(item.key)) + } + return joinStringsOrDash(parts) +} + +func joinStringsOrDash(parts []string) string { + if len(parts) == 0 { + return "-" + } + return strings.Join(parts, ", ") +} + +func namingLines(naming map[string]any, expanded bool) []string { + if len(naming) == 0 { + return nil + } + scope := "" + if raw, ok := naming["dominant_extension_scope"].(string); ok { + scope = raw + } + active := naming + if scope != "" { + if byExt, ok := naming["by_extension"].(map[string]any); ok { + if scoped, ok := byExt[scope].(map[string]any); ok { + active = scoped + } + } + } + dominantBucket, _ := active["dominant_bucket"].(string) + dominantCount := asInt(active["dominant_count"]) + comparableCount := asInt(active["comparable_count"]) + if dominantBucket == "" { + if !expanded { + return nil + } + lines := []string{" no dominant filename bucket met the threshold"} + return append(lines, bucketTableLines(anyMap(active["buckets"]))...) 
+ } + + subject := "filenames" + if scope != "" { + subject = fmt.Sprintf("%s filenames", extensionLabel(scope)) + } + lines := []string{} + lines = append(lines, alignRows([][]string{ + {"scope", subject}, + {"pattern", dominantBucket}, + {"matches", fmt.Sprintf("%d of %d files", dominantCount, comparableCount)}, + }, " ", ": ")...) + exceptions := anySlice(active["exceptions"]) + limit := 2 + if expanded { + limit = len(exceptions) + } + if len(exceptions) > 0 { + var examples []string + for i, item := range exceptions { + if i >= limit { + break + } + m := item.(map[string]any) + examples = append(examples, m["path"].(string)) + } + lines = append(lines, " exceptions:") + for _, example := range examples { + lines = append(lines, fmt.Sprintf(" %s", example)) + } + if len(exceptions) > limit { + lines = append(lines, fmt.Sprintf(" ... %d more naming exceptions hidden; pass -v to show all", len(exceptions)-limit)) + } + } + if expanded { + lines = append(lines, bucketTableLines(anyMap(active["buckets"]))...) + } + return lines +} + +func bucketTableLines(buckets map[string]any) []string { + return append([]string{"", " filename buckets:"}, histogramTableLines(buckets, len(buckets), "BUCKET", "FILES")...) 
+} + +func alignRows(rows [][]string, prefix, sep string) []string { + width := 0 + for _, row := range rows { + if len(row) > 0 && len(row[0]) > width { + width = len(row[0]) + } + } + lines := make([]string, 0, len(rows)) + for _, row := range rows { + if len(row) < 2 { + continue + } + lines = append(lines, fmt.Sprintf("%s%-*s%s%s", prefix, width, row[0], sep, row[1])) + } + return lines +} + +func alignTable(rows [][]string, prefix string) []string { + if len(rows) == 0 { + return nil + } + cols := 0 + for _, row := range rows { + if len(row) > cols { + cols = len(row) + } + } + widths := make([]int, cols) + for _, row := range rows { + for i, cell := range row { + if len(cell) > widths[i] { + widths[i] = len(cell) + } + } + } + lines := make([]string, 0, len(rows)) + for _, row := range rows { + var b strings.Builder + b.WriteString(prefix) + for i := 0; i < cols; i++ { + cell := "" + if i < len(row) { + cell = row[i] + } + if i > 0 { + b.WriteString(" ") + } + fmt.Fprintf(&b, "%-*s", widths[i], cell) + } + lines = append(lines, strings.TrimRight(b.String(), " ")) + } + return lines +} + +type histogramItem struct { + key string + n int +} + +func sortedHistogram(hist map[string]any) []histogramItem { + items := make([]histogramItem, 0, len(hist)) + for k, v := range hist { + items = append(items, histogramItem{key: k, n: asInt(v)}) + } + sort.SliceStable(items, func(i, j int) bool { + if items[i].n != items[j].n { + return items[i].n > items[j].n + } + return items[i].key < items[j].key + }) + return items +} + +func dominantAnyExtension(hist map[string]any) (string, int) { + items := sortedHistogram(hist) + if len(items) == 0 { + return "", 0 + } + return items[0].key, items[0].n +} + +func extensionLabel(ext string) string { + if ext == "" { + return "no extension" + } + return ext +} + +func asInt(v any) int { + switch x := v.(type) { + case int: + return x + case float64: + return int(x) + default: + return 0 + } +} + +func anyMap(v any) map[string]any { 
+ if m, ok := v.(map[string]any); ok { + return m + } + return map[string]any{} +} + +func anySlice(v any) []any { + if s, ok := v.([]any); ok { + return s + } + return nil +} + +func stringSlice(v any) []string { + switch x := v.(type) { + case []string: + return x + case []any: + out := make([]string, 0, len(x)) + for _, item := range x { + if s, ok := item.(string); ok { + out = append(out, s) + } + } + return out + default: + return nil + } +} + // dataLines renders one inspector's Data to individual Markdown lines, so the // caller can count and truncate them. func dataLines(data map[string]any) []string { diff --git a/internal/inspect/render_test.go b/internal/inspect/render_test.go index a2976c7..f8e1fc0 100644 --- a/internal/inspect/render_test.go +++ b/internal/inspect/render_test.go @@ -58,6 +58,78 @@ func TestRenderMarkdown_truncatesPerInspector(t *testing.T) { } } +func TestRenderMarkdown_fileTreeSmallTree(t *testing.T) { + ev := inspect.Evidence{Inspector: "file_tree", Scope: "repo", N: 3, Data: map[string]any{ + "file_count": 3, + "dir_count": 2, + "max_depth": 2, + "extensions": map[string]any{".md": 3}, + "top_level_regions": []any{ + map[string]any{"path": "books/", "file_count": 3, "extensions": map[string]any{".md": 3}}, + }, + "tree_entries": []any{".", "+-- books/", " +-- dune.md", " +-- it.md"}, + "representative_paths": []any{"books/dune.md", "books/it.md"}, + "directory_summaries": []any{}, + "naming": map[string]any{ + "dominant_bucket": "lowercase", + "dominant_count": 3, + "comparable_count": 3, + "buckets": map[string]any{"lowercase": 3}, + "exceptions": []any{}, + }, + }} + md := inspect.RenderMarkdown([]inspect.Evidence{ev}, 20) + for _, want := range []string{ + "summary:", + " files : 3", + " dominant type: .md (3 of 3 files)", + "tree:", + "+-- books/", + "file types:", + " TYPE FILES", + " .md 3", + "naming:", + " pattern: lowercase", + } { + if !strings.Contains(md, want) { + t.Errorf("file_tree markdown missing %q\n%s", want, md) 
+ } + } +} + +func TestRenderMarkdown_fileTreeVerboseShowsExpandedEvidence(t *testing.T) { + ev := inspect.Evidence{Inspector: "file_tree", Scope: "repo", N: 1, Data: map[string]any{ + "file_count": 1, + "dir_count": 1, + "max_depth": 5, + "extensions": map[string]any{".md": 1, ".png": 1, ".css": 1, ".txt": 1, ".yml": 1, ".json": 1}, + "tree_entries": []any{}, + "top_level_regions": []any{}, + "naming": map[string]any{ + "dominant_bucket": "", + "dominant_count": 0, + "comparable_count": 1, + "buckets": map[string]any{"mixed/other": 1}, + "exceptions": []any{}, + }, + "directory_summaries": []any{ + map[string]any{"path": ".", "descendant_file_count": 1, "direct_file_count": 0, "markdown_heavy": false}, + }, + "deep_paths": []any{"a/b/c/d/e.md"}, + "representative_paths": []any{"a/b/c/d/e.md"}, + }} + defaultMD := inspect.RenderMarkdown([]inspect.Evidence{ev}, 20) + if strings.Contains(defaultMD, "directory density:") { + t.Errorf("default file_tree output should stay skimmable\n%s", defaultMD) + } + verbose := inspect.RenderMarkdown([]inspect.Evidence{ev}, 0) + for _, want := range []string{"directory density:", "deep paths:", " mixed/other 1"} { + if !strings.Contains(verbose, want) { + t.Errorf("verbose file_tree markdown missing %q\n%s", want, verbose) + } + } +} + func TestRenderJSON_roundTrips(t *testing.T) { out, err := inspect.RenderJSON(renderInput()) if err != nil { diff --git a/internal/inspect/source_test.go b/internal/inspect/source_test.go index e54f000..4ffc3df 100644 --- a/internal/inspect/source_test.go +++ b/internal/inspect/source_test.go @@ -6,7 +6,7 @@ import ( "github.com/abegong/katalyst/internal/inspect" ) -func TestFileTree_opensNothingAndProfilesDirs(t *testing.T) { +func TestFileTree_opensNothingAndReportsFilesystemMap(t *testing.T) { dir := t.TempDir() writeFile(t, dir, "notes/dune.md", "---\ntitle: Dune\n---\n# Dune\n\n## Review\n") writeFile(t, dir, "notes/messiah.md", "---\ntitle: Messiah\n---\n# Messiah\n\n## Review\n") @@ -33,9 
+33,29 @@ func TestFileTree_opensNothingAndProfilesDirs(t *testing.T) { if ev.Inspector != "file_tree" || ev.Scope != dir { t.Errorf("file_tree evidence = %+v", ev) } - // notes (.md, kebab) and assets (.png) are distinct directory profiles. - if got := classTotal(t, ev); got != 2 { - t.Errorf("distinct directory classes = %d, want 2", got) + if got := ev.Data["file_count"].(int); got != 3 { + t.Errorf("file_count = %d, want 3", got) + } + if got := ev.Data["dir_count"].(int); got != 3 { + t.Errorf("dir_count = %d, want 3", got) + } + if got := ev.Data["max_depth"].(int); got != 2 { + t.Errorf("max_depth = %d, want 2", got) + } + extensions := ev.Data["extensions"].(map[string]any) + if extensions[".md"].(int) != 2 || extensions[".png"].(int) != 1 { + t.Errorf("extensions = %v, want .md=2 .png=1", extensions) + } + regions := ev.Data["top_level_regions"].([]any) + if len(regions) != 2 { + t.Fatalf("regions = %d, want 2", len(regions)) + } + first := regions[0].(map[string]any) + if first["path"] != "notes/" || first["file_count"].(int) != 2 { + t.Errorf("first region = %v, want notes/ with 2 files", first) + } + if len(ev.Data["tree_entries"].([]any)) == 0 { + t.Errorf("small file tree should include tree_entries") } } @@ -80,10 +100,3 @@ func TestDocumentShape_clustersOnCompositeFingerprint(t *testing.T) { t.Errorf("outliers = %d, want 1 (notes, distinct body)", len(outliers)) } } - -// classTotal counts distinct classes (non-singleton classes plus singleton -// outliers) in a summarized evidence payload. 
-func classTotal(t *testing.T, ev inspect.Evidence) int { - t.Helper() - return len(ev.Data["classes"].([]any)) + len(ev.Data["outliers"].([]any)) -} From 267c1410c84577b6afb2628b1bccaada6d3352fa Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Wed, 24 Jun 2026 20:13:11 -0600 Subject: [PATCH 03/10] Use connected unicode tree output --- cmd/testdata/snapshots/inspect/source-report.txt | 6 +++--- internal/inspect/filetree.go | 10 +++++----- internal/inspect/render_test.go | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cmd/testdata/snapshots/inspect/source-report.txt b/cmd/testdata/snapshots/inspect/source-report.txt index c446b59..088c34d 100644 --- a/cmd/testdata/snapshots/inspect/source-report.txt +++ b/cmd/testdata/snapshots/inspect/source-report.txt @@ -32,9 +32,9 @@ summary: tree: . -+-- books/ - +-- dune.md - +-- it.md +└── books/ + ├── dune.md + └── it.md file types: TYPE FILES diff --git a/internal/inspect/filetree.go b/internal/inspect/filetree.go index fe1d26f..5b46cda 100644 --- a/internal/inspect/filetree.go +++ b/internal/inspect/filetree.go @@ -164,7 +164,7 @@ func buildFileTreeSummary(v SourceView) map[string]any { s.naming.dominantExtScope = dominantNamingScope(s.naming.byExtension) s.representativePaths = representativePaths(files, 10) if len(files) <= smallTreeFileLimit && s.dirCount <= smallTreeDirLimit { - s.treeLines = asciiTree(files) + s.treeLines = unicodeTree(files) } return s.toMap() } @@ -427,7 +427,7 @@ func representativePaths(files []sourceFile, cap int) []string { return out } -func asciiTree(files []sourceFile) []string { +func unicodeTree(files []sourceFile) []string { type node struct { name string file bool @@ -463,10 +463,10 @@ func asciiTree(files []sourceFile) []string { for i, name := range names { child := n.children[name] last := i == len(names)-1 - connector := "+-- " - nextPrefix := prefix + "| " + connector := "├── " + nextPrefix := prefix + "│ " if last { - connector = "+-- " + connector = "└── " 
nextPrefix = prefix + " " } label := child.name diff --git a/internal/inspect/render_test.go b/internal/inspect/render_test.go index f8e1fc0..41947db 100644 --- a/internal/inspect/render_test.go +++ b/internal/inspect/render_test.go @@ -67,7 +67,7 @@ func TestRenderMarkdown_fileTreeSmallTree(t *testing.T) { "top_level_regions": []any{ map[string]any{"path": "books/", "file_count": 3, "extensions": map[string]any{".md": 3}}, }, - "tree_entries": []any{".", "+-- books/", " +-- dune.md", " +-- it.md"}, + "tree_entries": []any{".", "└── books/", " ├── dune.md", " └── it.md"}, "representative_paths": []any{"books/dune.md", "books/it.md"}, "directory_summaries": []any{}, "naming": map[string]any{ @@ -84,7 +84,7 @@ func TestRenderMarkdown_fileTreeSmallTree(t *testing.T) { " files : 3", " dominant type: .md (3 of 3 files)", "tree:", - "+-- books/", + "└── books/", "file types:", " TYPE FILES", " .md 3", From 389653445f9f48cdcaa3b4f336573e1fa705ca7b Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Wed, 24 Jun 2026 20:16:17 -0600 Subject: [PATCH 04/10] Separate file tree report sections --- .../snapshots/inspect/source-report.txt | 3 +++ internal/inspect/render.go | 24 +++++++++---------- internal/inspect/render_test.go | 1 + 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/cmd/testdata/snapshots/inspect/source-report.txt b/cmd/testdata/snapshots/inspect/source-report.txt index 088c34d..ff93ada 100644 --- a/cmd/testdata/snapshots/inspect/source-report.txt +++ b/cmd/testdata/snapshots/inspect/source-report.txt @@ -30,16 +30,19 @@ summary: max depth : 2 dominant type: - +---------------------------------------- tree: . 
└── books/ ├── dune.md └── it.md +---------------------------------------- file types: TYPE FILES .md 2 +---------------------------------------- representative paths: books/dune.md books/it.md diff --git a/internal/inspect/render.go b/internal/inspect/render.go index 9845bfa..e4f3337 100644 --- a/internal/inspect/render.go +++ b/internal/inspect/render.go @@ -98,16 +98,15 @@ func fileTreeMarkdownLines(data map[string]any, expanded bool) []string { tree := stringSlice(data["tree_entries"]) if len(tree) > 0 { - lines = append(lines, "", "tree:") + lines = appendSection(lines, "tree:") lines = append(lines, tree...) - lines = append(lines, "") } else { regions := anySlice(data["top_level_regions"]) limit := 5 if expanded { limit = len(regions) } - lines = append(lines, "", "top-level regions:") + lines = appendSection(lines, "top-level regions:") rows := [][]string{{"REGION", "FILES", "TYPES"}} for i, region := range regions { if i >= limit { @@ -123,28 +122,25 @@ func fileTreeMarkdownLines(data map[string]any, expanded bool) []string { if len(regions) > limit { lines = append(lines, fmt.Sprintf(" ... %d more top-level entries hidden; pass -v to show all", len(regions)-limit)) } - lines = append(lines, "") } extLimit := 5 if expanded { extLimit = len(extensions) } - lines = append(lines, "file types:") + lines = appendSection(lines, "file types:") lines = append(lines, histogramTableLines(extensions, extLimit, "TYPE", "FILES")...) if len(extensions) > extLimit { lines = append(lines, fmt.Sprintf(" ... %d more extensions hidden; pass -v to show all", len(extensions)-extLimit)) } if naming := namingLines(anyMap(data["naming"]), expanded); len(naming) > 0 { - lines = append(lines, "") - lines = append(lines, "naming:") + lines = appendSection(lines, "naming:") lines = append(lines, naming...) 
} if expanded { - lines = append(lines, "") - lines = append(lines, "directory density:") + lines = appendSection(lines, "directory density:") rows := [][]string{{"DIRECTORY", "FILES", "DIRECT", "NOTES"}} for _, item := range anySlice(data["directory_summaries"]) { dir := item.(map[string]any) @@ -159,8 +155,7 @@ func fileTreeMarkdownLines(data map[string]any, expanded bool) []string { } lines = append(lines, alignTable(rows, " ")...) if deep := stringSlice(data["deep_paths"]); len(deep) > 0 { - lines = append(lines, "") - lines = append(lines, "deep paths:") + lines = appendSection(lines, "deep paths:") for _, rel := range deep { lines = append(lines, fmt.Sprintf(" %s", rel)) } @@ -173,8 +168,7 @@ func fileTreeMarkdownLines(data map[string]any, expanded bool) []string { if expanded { limit = len(paths) } - lines = append(lines, "") - lines = append(lines, "representative paths:") + lines = appendSection(lines, "representative paths:") for i, rel := range paths { if i >= limit { break @@ -188,6 +182,10 @@ func fileTreeMarkdownLines(data map[string]any, expanded bool) []string { return lines } +func appendSection(lines []string, label string) []string { + return append(lines, "", "----------------------------------------", label) +} + func dominantExtensionSummary(fileCount int, exts map[string]any) string { if ext, n := dominantAnyExtension(exts); ext != "" && fileCount > 0 && n >= 3 && n*100 >= fileCount*60 { return fmt.Sprintf("%s (%d of %d files)", extensionLabel(ext), n, fileCount) diff --git a/internal/inspect/render_test.go b/internal/inspect/render_test.go index 41947db..5ba0753 100644 --- a/internal/inspect/render_test.go +++ b/internal/inspect/render_test.go @@ -83,6 +83,7 @@ func TestRenderMarkdown_fileTreeSmallTree(t *testing.T) { "summary:", " files : 3", " dominant type: .md (3 of 3 files)", + "----------------------------------------", "tree:", "└── books/", "file types:", From bf8c1776475b598823e4ba3215feed72c699895a Mon Sep 17 00:00:00 2001 From: 
Abe Gong Date: Wed, 24 Jun 2026 20:18:09 -0600 Subject: [PATCH 05/10] Start file tree report with divider --- cmd/testdata/snapshots/inspect/source-report.txt | 1 + internal/inspect/render.go | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cmd/testdata/snapshots/inspect/source-report.txt b/cmd/testdata/snapshots/inspect/source-report.txt index ff93ada..b208fe7 100644 --- a/cmd/testdata/snapshots/inspect/source-report.txt +++ b/cmd/testdata/snapshots/inspect/source-report.txt @@ -24,6 +24,7 @@ _Cluster files into candidate collections by a composite fingerprint of frontmat _Map files, directories, extensions, regions, and filename conventions, opening no files._ +---------------------------------------- summary: files : 2 directories : 2 diff --git a/internal/inspect/render.go b/internal/inspect/render.go index e4f3337..ec8e8ca 100644 --- a/internal/inspect/render.go +++ b/internal/inspect/render.go @@ -88,7 +88,7 @@ func fileTreeMarkdownLines(data map[string]any, expanded bool) []string { dirCount := asInt(data["dir_count"]) maxDepth := asInt(data["max_depth"]) extensions := anyMap(data["extensions"]) - lines := []string{"summary:"} + lines := []string{sectionDivider, "summary:"} lines = append(lines, alignRows([][]string{ {"files", fmt.Sprintf("%d", fileCount)}, {"directories", fmt.Sprintf("%d", dirCount)}, @@ -183,9 +183,11 @@ func fileTreeMarkdownLines(data map[string]any, expanded bool) []string { } func appendSection(lines []string, label string) []string { - return append(lines, "", "----------------------------------------", label) + return append(lines, "", sectionDivider, label) } +const sectionDivider = "----------------------------------------" + func dominantExtensionSummary(fileCount int, exts map[string]any) string { if ext, n := dominantAnyExtension(exts); ext != "" && fileCount > 0 && n >= 3 && n*100 >= fileCount*60 { return fmt.Sprintf("%s (%d of %d files)", extensionLabel(ext), n, fileCount) From 
5419ebc37c942cf151d147cdbd79bf36345b04ac Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Wed, 24 Jun 2026 20:21:45 -0600 Subject: [PATCH 06/10] Spec file_content_shape inspector --- .../file-content-shape-inspector-spec.md | 345 ++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 product/specs/file-content-shape-inspector-spec.md diff --git a/product/specs/file-content-shape-inspector-spec.md b/product/specs/file-content-shape-inspector-spec.md new file mode 100644 index 0000000..895da97 --- /dev/null +++ b/product/specs/file-content-shape-inspector-spec.md @@ -0,0 +1,345 @@ +# Spec - file content shape inspector + +> **Status: planning.** Tracks [#111](https://github.com/abegong/katalyst/issues/111). + +## Overview + +`file_content_shape` summarizes shared content structure across a selected set +of files. It is the second raw-source inspection level: `file_tree` maps the +store without opening files, then `file_content_shape` opens selected files and +reports the text, tabular, and tree views Katalyst extracts from them. + +## Value + +Automatic clustering hides the question the reader is asking: "Does this slice +of files behave like a coherent set of items?" `file_content_shape` lets a human +or agent propose a directory, glob, extension filter, or path query, then inspect +the shared evidence for that candidate set. + +The loop is explicit: + +1. Run `katalyst inspect .` to see the store map. +2. Run `katalyst inspect file_content_shape` with a candidate selection such as + `content/books/*.md`, `data/*.csv`, or `path under docs/reference`. +3. Read common structure and variation. +4. Refine the selection or draft a collection from the evidence. + +## Current State + +- `cmd/inspect.go` accepts one `` argument. A configured + collection name runs collection inspectors; a filesystem path runs every + raw-source inspector over the whole directory. 
+- `internal/inspect/source.go` builds `SourceView.files` from path metadata and + lazily parses only `.md` files through `SourceView.markdown`. +- `internal/inspect/inspectors_source.go` implements `FileTreeContent.Inspect` + by grouping parsed Markdown documents per directory, reducing each directory + to feature tokens (`parsed`, `frontmatter`, `fmkey:<key>`), and passing them + to `summarize`. +- `internal/inspect/registry.go` describes `file_tree_content` as Markdown-only: + "Parse markdown and profile each directory's content shape." +- `internal/inspect/render.go` renders every inspector through the same generic + key/value Markdown renderer. The output exposes `classes` and `outliers` + instead of a readable report. +- `docs/content/deep-dives/inspectors.md` describes `file_tree` and + `document_shape` as summarizing inspectors that collapse profiles into classes. + +That model is too narrow. Markdown is one content source, not the boundary of the +second raw-source inspector. Profiling the entire directory by default also mixes +unrelated files and makes the result less useful. + +## Design + +### Raw-source levels + +The raw-source layer has two primary inspection levels: + +1. **Store map.** `file_tree` opens no files. It reports paths, directories, + extensions, naming, depth, and density. +2. **Content shape.** `file_content_shape` opens a selected set of files and + performs light parsing. It reports shared content views, common structure, + variation, and read or parse issues. + +The earlier `document_shape` clustering idea is deferred from the primary path. +Katalyst can add suggestion and clustering features later, but the core CLI +should first let the reader test explicit selections. + +### Command surface + +`file_content_shape` is addressed as an inspector subcommand, not as a flag on +the root `inspect` command.
The user-facing form keeps the inspector name +visible: + +```sh +katalyst inspect file_content_shape +``` + +The command supports path-query selections: + +```sh +katalyst inspect file_content_shape . 'ext = ".csv"' +katalyst inspect file_content_shape . 'path under "docs/reference"' +``` + +The old `file_tree_content` name should not be the long-term user-facing name. +If compatibility matters, keep it as an alias during the transition and render +the report as `file_content_shape`. + +### Selection + +A selection is the set of source files the profile uses as its denominator. The +default selection is every non-hidden file under the inspected directory. This +fallback is useful for discovery, but the intended workflow is to pass a narrower +selection. + +The first cut supports: + +- directory selection: `content/books/` +- glob selection: `content/books/*.md` +- path-level query: `ext = ".csv"` or `path under "docs/reference"` + +Metadata and content predicates are out of scope for the first cut because they +require content reads before selection. + +The profile output always prints the resolved selector label, file count, +directory count, extension mix, and skipped/unsupported count before reporting +content facts. + +### Content views + +The inspector extracts zero or more views from each selected file. A view is an +analysis result, not the file's identity. The same file can produce multiple +views. + +View families: + +- **Text.** Sequential text, paragraphs, lines, headings, section-like markers, + or visible text. +- **Tabular.** Rows and columns, from CSV first. Markdown tables, HTML tables, + and JSON arrays of objects are later parsers. +- **Tree.** Nested structure, from Markdown AST/frontmatter and JSON first. + YAML, TOML, HTML DOM, XML, and code ASTs are later parsers. 
+ +Each view record carries: + +- source path +- family: `text`, `tabular`, or `tree` +- parser id, such as `markdown-body`, `csv`, or `json-tree` +- status: `extracted`, `partial`, `failed`, or `unsupported` +- compact facets for that family + +### First-cut parser scope + +Start with three parser families: + +- Markdown text and tree views from the existing Markdown parser. +- CSV tabular views through the standard library. +- JSON tree views through the standard library. + +If any parser, facet, or selector form turns into a large dependency or a long +implementation detour, move it to a follow-up issue and keep the first cut +focused. The first cut is successful when the command surface, output shape, and +Markdown/JSON/CSV views work end to end. + +### Output shape + +Default Markdown should read like a short report, not an inventory dump. It +should state what was selected, whether the selection looks coherent, what +evidence supports that read, and which files differ. + +For a coherent Markdown selection: + +```markdown +### file_content_shape + +Selection: `content/books/*.md` + +24 files selected from `content/books/`. All 24 are readable Markdown files. +Katalyst extracted text and tree views from every file, plus tabular views from +2 files. + +This selection is coherent: + +- 24/24 files have frontmatter keys: title, status. +- 22/24 have a Review section. +- 24/24 filenames are kebab-case Markdown files. + +Variation: +- tags appears in 18/24 files. +- rating appears in 11/24 files. +- 2 files lack Review: books/foo.md, books/bar.md. + +Text: +- H1 in 24/24 files. +- H2 sections in 22/24 files. + +Tree: +- frontmatter object in 24/24 files. +- common keys: title 24/24, status 24/24, tags 18/24. + +Tabular: +- Markdown tables in 2/24 files. + +Read/parse issues: +- none +``` + +For a CSV selection: + +```markdown +### file_content_shape + +Selection: `data/*.csv` + +12 files selected from `data/`. All 12 parse as CSV. 
+ +This selection is coherent: + +- 12/12 files have columns: id, title, status. +- row count ranges from 8 to 118, median 42. +- 10/12 files include an optional notes column. + +Tabular: +- common columns: id 12/12, title 12/12, status 12/12. +- optional columns: notes 10/12, tags 4/12. + +Read/parse issues: +- none +``` + +For a JSON selection: + +```markdown +### file_content_shape + +Selection: `ext = ".json"` + +9 files selected across 3 directories. All 9 parse as JSON tree views. + +This selection is partly coherent: + +- 7/9 files are top-level objects. +- 7/9 files share keys: id, title, status. +- 2/9 files are arrays and should be profiled separately. + +Tree: +- top-level object: 7/9 files. +- top-level array: 2/9 files. +- common object keys: id 7/7, title 7/7, status 7/7. + +Variation: +- 2 array files: fixtures/books.json, fixtures/movies.json. +``` + +For a broad selection: + +```markdown +### file_content_shape + +Selection: `docs/**` + +142 files selected across 18 directories. Katalyst extracted content views from +106 files and skipped 36 assets or unsupported files. + +This selection is too mixed to profile as one item set: + +- No content view appears across more than 42% of selected files. +- Extensions are mixed: .md 31, .png 18, .json 5, .css 4. + +Variation: +- 18 files are assets or unsupported. +- 5 JSON files share object keys, but they are only 8% of the selection. +``` + +Verbose output expands examples, per-directory breakdowns, and full frequency +tables. JSON remains complete and parseable. + +### Common structure and variation + +The report's most important claim is whether the selection behaves like a +coherent item set. 
+ +Common structure reports high-frequency facts across the selected files: + +- content view families present in most files +- common frontmatter or object keys +- common columns +- common headings or section labels +- common top-level tree shape + +Variation reports meaningful differences: + +- optional keys or columns +- missing sections +- parse failures +- unsupported files inside the selection +- subsets that look coherent but represent a small fraction of the selection + +These sections use counts and denominators. They do not recommend a schema or +collection. + +### Relationship to `document_shape` + +`document_shape` should not be the primary automatic clustering path for this +workflow. A future clustering or suggestion command can propose likely +selections, but `file_content_shape` should profile an explicit selection and +report evidence. + +This keeps Katalyst's primary raw-source flow deterministic and explainable: + +1. map the store +2. choose or query a slice +3. profile the slice + +## Backoff rule + +This spec sets the direction, not a mandate to build every parser and query form +at once. During planning or implementation, any part that becomes expensive, +unclear, or dependency-heavy should move to a follow-up issue. Preserve the +content-shape model and ship the smallest coherent version first. + +Examples of acceptable deferrals: + +- HTML DOM parsing +- code AST parsing +- Markdown table extraction +- JSON array-to-table inference +- YAML parsing +- a full query language +- automatic selection suggestions + +## Open Questions + +_None._ The first cut is intentionally small: inspector subcommand, +directory/glob/path-query selections, Markdown/JSON/CSV parsers, and the +`file_content_shape` user-facing name. + +## Documentation updates + +- `docs/content/deep-dives/inspectors.md`: update the raw-source model from + profile clustering to store map plus content shape. 
Note that clustering and + suggestions are follow-up features, not the primary flow. +- `internal/inspect/doc.go`: align the package summary if it names Markdown-only + or clustering-specific behavior. +- `docs/content/reference/inspectors/`: regenerate with `make docs-gen` if the + registry descriptor changes from `file_tree_content` to `file_content_shape`. +- `docs/content/reference/cli.md`: document the `inspect file_content_shape` + subcommand and its selection syntax. +- `docs/content/reference/glossary.md`: add `content view` and + `file_content_shape` if those terms survive implementation. + +## Test checklist + +- Selection summary reports selector label, file count, directory count, + extension mix, readable count, and unsupported/skipped count. +- Markdown files produce text and tree views without making Markdown the + top-level output category. +- JSON files produce tree-view common keys. +- CSV files produce tabular column and row-count summaries. +- Broad mixed selections report weak commonality and skipped unsupported files. +- Narrow coherent selections report common structure and variation with + denominators. +- Parse failures are visible and grounded in paths. +- JSON output remains complete and parseable. +- Default Markdown is capped; verbose output expands examples and frequency + tables. + From cf45a1d6f176cf56ec2ae1150dd703e0ea13ab19 Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Wed, 24 Jun 2026 22:14:53 -0600 Subject: [PATCH 07/10] Revise content shape inspector command surface --- .../file-content-shape-inspector-spec.md | 76 +++++++++++++------ 1 file changed, 54 insertions(+), 22 deletions(-) diff --git a/product/specs/file-content-shape-inspector-spec.md b/product/specs/file-content-shape-inspector-spec.md index 895da97..cd09d41 100644 --- a/product/specs/file-content-shape-inspector-spec.md +++ b/product/specs/file-content-shape-inspector-spec.md @@ -19,8 +19,9 @@ the shared evidence for that candidate set. The loop is explicit: 1. 
Run `katalyst inspect .` to see the store map. -2. Run `katalyst inspect file_content_shape` with a candidate selection such as - `content/books/*.md`, `data/*.csv`, or `path under docs/reference`. +2. Run `katalyst inspect . --inspector file_content_shape --select ...` with a + candidate selection such as `content/books/*.md`, `data/*.csv`, or + `path under docs/reference`. 3. Read common structure and variation. 4. Refine the selection or draft a collection from the evidence. @@ -29,6 +30,14 @@ The loop is explicit: - `cmd/inspect.go` accepts one `` argument. A configured collection name runs collection inspectors; a filesystem path runs every raw-source inspector over the whole directory. +- `cmd/inspect.go` already supports inspector narrowing through the repeatable + `--inspector` flag. That is the natural user-facing hook for + `file_content_shape`; adding an inspector-specific subcommand would create a + second invocation grammar for the same registry. +- `internal/inspect/params.go` carries inspector parameters (`--detail`, + `--similarity`, `--max-classes`) through `inspect.Params`. Inspectors that do + not use a parameter ignore it. Selection can follow that pattern if validation + keeps it scoped to `file_content_shape`. - `internal/inspect/source.go` builds `SourceView.files` from path metadata and lazily parses only `.md` files through `SourceView.markdown`. - `internal/inspect/inspectors_source.go` implements `FileTreeContent.Inspect` @@ -65,24 +74,37 @@ should first let the reader test explicit selections. ### Command surface -`file_content_shape` is addressed as an inspector subcommand, not as a flag on -the root `inspect` command. The user-facing form keeps the inspector name -visible: +`file_content_shape` is a regular source inspector registered in +`internal/inspect/registry.go`, not a Cobra subcommand. 
The existing `inspect` +shape stays intact: the positional argument selects the root, `--inspector` +selects the inspector, and a new `--select` parameter narrows the file set that +`file_content_shape` profiles. ```sh -katalyst inspect file_content_shape +katalyst inspect <root> --inspector file_content_shape --select <selection> ``` -The command supports path-query selections: +Examples: ```sh -katalyst inspect file_content_shape . 'ext = ".csv"' -katalyst inspect file_content_shape . 'path under "docs/reference"' +katalyst inspect . --inspector file_content_shape --select 'content/books/*.md' +katalyst inspect . --inspector file_content_shape --select 'ext = ".csv"' +katalyst inspect . --inspector file_content_shape --select 'path under "docs/reference"' ``` -The old `file_tree_content` name should not be the long-term user-facing name. -If compatibility matters, keep it as an alias during the transition and render -the report as `file_content_shape`. +The first cut should treat `--select` as a parameter owned by +`file_content_shape`: it is valid only when exactly one source inspector is +selected and that inspector is `file_content_shape`. Passing `--select` with a +collection-layer target, with no `--inspector`, with multiple `--inspector` +flags, or with another inspector is a usage error. This keeps the existing +inspect pipeline predictable and avoids making every inspector define selection +semantics. + +The old `file_tree_content` name should not remain the long-term user-facing +name. For the first cut, replace it in the registry with `file_content_shape` +rather than shipping two public names. If callers need compatibility later, add +an alias deliberately with tests that prove JSON and Markdown render the +canonical `file_content_shape` name. ### Selection @@ -104,6 +126,12 @@ The profile output always prints the resolved selector label, file count, directory count, extension mix, and skipped/unsupported count before reporting content facts.
+Selection is resolved after the `SourceView` walk and before any content parser +runs, so it is path-derived and opens no files. The resolved selection can be +stored on `inspect.Params` as a small value object (for example `Selection{ +Label, Mode, Pattern}`) and applied by `FileContentShape.Inspect`. Other +inspectors should not see selected subsets in the first cut. + ### Content views The inspector extracts zero or more views from each selected file. A view is an @@ -151,7 +179,7 @@ For a coherent Markdown selection: ```markdown ### file_content_shape -Selection: `content/books/*.md` +selector: content/books/*.md 24 files selected from `content/books/`. All 24 are readable Markdown files. Katalyst extracted text and tree views from every file, plus tabular views from @@ -188,7 +216,7 @@ For a CSV selection: ```markdown ### file_content_shape -Selection: `data/*.csv` +selector: data/*.csv 12 files selected from `data/`. All 12 parse as CSV. @@ -211,7 +239,7 @@ For a JSON selection: ```markdown ### file_content_shape -Selection: `ext = ".json"` +selector: ext = ".json" 9 files selected across 3 directories. All 9 parse as JSON tree views. @@ -235,7 +263,7 @@ For a broad selection: ```markdown ### file_content_shape -Selection: `docs/**` +selector: docs/** 142 files selected across 18 directories. Katalyst extracted content views from 106 files and skipped 36 assets or unsupported files. @@ -309,9 +337,10 @@ Examples of acceptable deferrals: ## Open Questions -_None._ The first cut is intentionally small: inspector subcommand, -directory/glob/path-query selections, Markdown/JSON/CSV parsers, and the -`file_content_shape` user-facing name. +_None._ The first cut is intentionally small: regular inspector registry +addition, `--select` as a scoped inspect parameter, directory/glob/path-query +selections, Markdown/JSON/CSV parsers, and the `file_content_shape` user-facing +name. 
## Documentation updates @@ -322,8 +351,9 @@ directory/glob/path-query selections, Markdown/JSON/CSV parsers, and the or clustering-specific behavior. - `docs/content/reference/inspectors/`: regenerate with `make docs-gen` if the registry descriptor changes from `file_tree_content` to `file_content_shape`. -- `docs/content/reference/cli.md`: document the `inspect file_content_shape` - subcommand and its selection syntax. +- `docs/content/reference/cli.md`: document + `katalyst inspect --inspector file_content_shape --select ...` and the + supported selection syntax. - `docs/content/reference/glossary.md`: add `content view` and `file_content_shape` if those terms survive implementation. @@ -331,6 +361,9 @@ directory/glob/path-query selections, Markdown/JSON/CSV parsers, and the - Selection summary reports selector label, file count, directory count, extension mix, readable count, and unsupported/skipped count. +- `--select` is accepted only with + `--inspector file_content_shape` on a source-layer target; invalid combinations + return usage errors. - Markdown files produce text and tree views without making Markdown the top-level output category. - JSON files produce tree-view common keys. @@ -342,4 +375,3 @@ directory/glob/path-query selections, Markdown/JSON/CSV parsers, and the - JSON output remains complete and parseable. - Default Markdown is capped; verbose output expands examples and frequency tables. 
- From 66a0577312996fd6cd881d0e35003091f140bbf6 Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Wed, 24 Jun 2026 22:27:03 -0600 Subject: [PATCH 08/10] Add file content shape inspector --- cmd/inspect.go | 12 + cmd/inspect_test.go | 59 ++++ cmd/testdata/snapshots/help/inspect.txt | 1 + .../snapshots/inspect/source-report.txt | 56 +++- cmd/testdata/snapshots/inspectors/list.txt | 4 +- .../inspectors/show-document_shape.txt | 2 +- docs/content/deep-dives/inspectors.md | 19 +- docs/content/reference/cli.md | 16 + docs/content/reference/inspectors/_index.md | 2 +- .../reference/inspectors/source/_index.md | 2 +- ...-tree-content.md => file-content-shape.md} | 8 +- internal/inspect/filecontentshape.go | 316 ++++++++++++++++++ internal/inspect/inspectors_source.go | 52 --- internal/inspect/params.go | 15 + internal/inspect/registry.go | 10 +- internal/inspect/render.go | 172 ++++++++++ internal/inspect/selection.go | 95 ++++++ internal/inspect/selection_test.go | 26 ++ internal/inspect/source.go | 7 + internal/inspect/source_test.go | 17 +- .../file-content-shape-inspector-plan.md | 206 ++++++++++++ 21 files changed, 1014 insertions(+), 83 deletions(-) rename docs/content/reference/inspectors/source/{file-tree-content.md => file-content-shape.md} (60%) create mode 100644 internal/inspect/filecontentshape.go create mode 100644 internal/inspect/selection.go create mode 100644 internal/inspect/selection_test.go create mode 100644 product/specs/file-content-shape-inspector-plan.md diff --git a/cmd/inspect.go b/cmd/inspect.go index 1ce00f6..2dda334 100644 --- a/cmd/inspect.go +++ b/cmd/inspect.go @@ -19,6 +19,7 @@ func newInspectCmd() *cobra.Command { detail string similarity float64 maxClasses int + selectExpr string ) c := &cobra.Command{ @@ -40,6 +41,12 @@ nothing. 
Output is Markdown by default; --json emits the same evidence as JSON.` if err != nil { return usageErr(err.Error()) } + if selectExpr != "" { + if len(inspectors) != 1 || inspectors[0] != "file_content_shape" { + return usageErr("--select requires exactly one source inspector: --inspector file_content_shape") + } + params = params.WithSelection(inspect.ParseSelection(selectExpr)) + } evidence, err := runInspect(args[0], inspectors, params) if err != nil { @@ -80,6 +87,8 @@ nothing. Output is Markdown by default; --json emits the same evidence as JSON.` "Summarizer similarity threshold (0–1). Mutually exclusive with --detail/--max-classes.") c.Flags().IntVar(&maxClasses, "max-classes", 0, "Cap the number of summarized classes. Mutually exclusive with --detail/--similarity.") + c.Flags().StringVar(&selectExpr, "select", "", + "Select files for file_content_shape: directory, glob, ext = \".csv\", or path under \"docs\".") return c } @@ -116,6 +125,9 @@ func resolveCollection(arg string) (*project.Project, project.Collection, bool) } func runCollectionLayer(proj *project.Project, c project.Collection, names []string, params inspect.Params) ([]inspect.Evidence, error) { + if params.Selection.Mode != "" { + return nil, usageErr("--select requires a source path target") + } selected, err := selectCollectionInspectors(names) if err != nil { return nil, err diff --git a/cmd/inspect_test.go b/cmd/inspect_test.go index 07687da..1cfa00e 100644 --- a/cmd/inspect_test.go +++ b/cmd/inspect_test.go @@ -126,6 +126,65 @@ func TestInspect_inspectorFlagNarrows(t *testing.T) { } } +func TestInspect_selectRunsFileContentShape(t *testing.T) { + dir := inspectRepo(t) + writeFile(t, dir, "data/books.csv", "title,rating\nDune,5\n") + stdout, _, err := runRoot(t, "inspect", "--json", "--inspector", "file_content_shape", "--select", `ext = ".csv"`, dir) + if err != nil { + t.Fatalf("inspect --select: %v", err) + } + var records []map[string]any + if err := json.Unmarshal([]byte(stdout), 
&records); err != nil { + t.Fatalf("bad json: %v", err) + } + if len(records) != 1 || records[0]["inspector"] != "file_content_shape" { + t.Fatalf("expected only file_content_shape, got %v", records) + } + ev := records[0]["evidence"].(map[string]any) + if got := ev["file_count"].(float64); got != 1 { + t.Errorf("file_count = %v, want 1 selected CSV file", got) + } + if got := ev["selector"].(string); got != `ext = ".csv"` { + t.Errorf("selector = %q", got) + } +} + +func TestInspect_selectRejectsInvalidCombinations(t *testing.T) { + dir := inspectRepo(t) + tests := [][]string{ + {"inspect", "--select", "books", dir}, + {"inspect", "--inspector", "file_tree", "--select", "books", dir}, + {"inspect", "--inspector", "file_content_shape", "--inspector", "document_shape", "--select", "books", dir}, + } + for _, args := range tests { + _, _, err := runRoot(t, args...) + var coded interface{ Code() int } + if err == nil || !errors.As(err, &coded) || coded.Code() != 2 { + t.Errorf("%v: expected exit 2, got %v", args, err) + } + } +} + +func TestInspect_selectRejectsCollectionTarget(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, ".katalyst/storage/local.yaml", `type: filesystem +root: . 
+collections: + notes: + path: notes + checks: + - kind: markdown_requires_h1 +`) + writeFile(t, dir, "notes/dune.md", "---\ntitle: Dune\n---\n# Dune\n") + chdir(t, dir) + + _, _, err := runRoot(t, "inspect", "--inspector", "file_content_shape", "--select", "notes", "notes") + var coded interface{ Code() int } + if err == nil || !errors.As(err, &coded) || coded.Code() != 2 { + t.Errorf("expected exit 2 for --select with collection target, got %v", err) + } +} + func TestInspect_writesNothingUnderScope(t *testing.T) { dir := inspectRepo(t) before := countFiles(t, dir) diff --git a/cmd/testdata/snapshots/help/inspect.txt b/cmd/testdata/snapshots/help/inspect.txt index e382b4d..1ed175d 100644 --- a/cmd/testdata/snapshots/help/inspect.txt +++ b/cmd/testdata/snapshots/help/inspect.txt @@ -20,5 +20,6 @@ Flags: --max-classes int Cap the number of summarized classes. Mutually exclusive with --detail/--similarity. --max-lines int Truncate each inspector's Markdown output to N lines (0 = no limit). (default 20) -o, --output string Write the report to a file instead of stdout. + --select string Select files for file_content_shape: directory, glob, ext = ".csv", or path under "docs". --similarity float Summarizer similarity threshold (0–1). Mutually exclusive with --detail/--max-classes. (default -1) -v, --verbose Show full output; do not truncate (same as --max-lines 0). 
diff --git a/cmd/testdata/snapshots/inspect/source-report.txt b/cmd/testdata/snapshots/inspect/source-report.txt index b208fe7..1092652 100644 --- a/cmd/testdata/snapshots/inspect/source-report.txt +++ b/cmd/testdata/snapshots/inspect/source-report.txt @@ -2,13 +2,59 @@ ## Structural -### file_tree_content (n=2) +### file_content_shape (n=2) -_Parse markdown and profile each directory's content shape: parse rate, frontmatter, key-sets._ +_Profile selected files by text, tabular, and tree content structure._ -- classes: -- outliers: - - features=[fmkey:status, fmkey:title, parsed, frontmatter] label=books +---------------------------------------- +selection: + expression : all files + files : 2 + directories : 1 + readable : 2 + unsupported : 0 + parse failures: 0 + +---------------------------------------- +file types: + TYPE FILES + .md 2 + +---------------------------------------- +coherence: + status: coherent + +---------------------------------------- +common structure: + - 2/2 Markdown files have an H1 + - 2/2 Markdown files have frontmatter key status + - 2/2 Markdown files have frontmatter key title + - 2/2 Markdown files have section Review + +---------------------------------------- +variation: + none + +---------------------------------------- +text: + files : 2 + with H1: 2 + frontmatter keys: + KEY FILES + status 2 + title 2 + +---------------------------------------- +tabular: + no CSV files selected + +---------------------------------------- +tree: + no JSON files selected + +---------------------------------------- +read/parse issues: + none ### document_shape (n=2) diff --git a/cmd/testdata/snapshots/inspectors/list.txt b/cmd/testdata/snapshots/inspectors/list.txt index 5050158..daf6b3c 100644 --- a/cmd/testdata/snapshots/inspectors/list.txt +++ b/cmd/testdata/snapshots/inspectors/list.txt @@ -2,8 +2,8 @@ Raw-source inspectors (3) ------------------------- - file_tree Map files, directories, extensions, regions, and filename conventions, opening 
no files. -- file_tree_content - Parse markdown and profile each directory's content shape: parse rate, frontmatter, key-sets. +- file_content_shape + Profile selected files by text, tabular, and tree content structure. - document_shape Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming. diff --git a/cmd/testdata/snapshots/inspectors/show-document_shape.txt b/cmd/testdata/snapshots/inspectors/show-document_shape.txt index 6581729..70cc184 100644 --- a/cmd/testdata/snapshots/inspectors/show-document_shape.txt +++ b/cmd/testdata/snapshots/inspectors/show-document_shape.txt @@ -12,4 +12,4 @@ Raw-source inspectors profile a backend store directly, before any collection co Other raw-source inspectors (2) ------------------------------- - file_tree -- file_tree_content +- file_content_shape diff --git a/docs/content/deep-dives/inspectors.md b/docs/content/deep-dives/inspectors.md index 55fcbaa..d1f8d4b 100644 --- a/docs/content/deep-dives/inspectors.md +++ b/docs/content/deep-dives/inspectors.md @@ -21,7 +21,7 @@ Inspectors come in two layers, distinguished by *how they reference the data*: - **The raw-source layer** (`SourceInspector` over a `SourceView`) measures a backend store directly, before any collection configuration, addressed by backend-native reference (a relative path today). It answers "what is in this - store?" - the onboarding case. `file_tree`, `file_tree_content`, and + store?" - the onboarding case. `file_tree`, `file_content_shape`, and `document_shape` live here. - **The collection layer** (`CollectionInspector` over a `CollectionView`) measures a configured collection's items, addressed by domain identity @@ -79,14 +79,15 @@ call to the reader. 
## Keeping output small -`file_tree` keeps Markdown output small with deterministic caps: small trees get -an actual tree, while larger trees show top-level regions, dominant extensions, -naming patterns, and representative paths with `-v` for expanded evidence. -Clustering inspectors such as `document_shape` still collapse near-identical -profiles into named classes, so output is proportional to the number of -*distinct* profiles rather than the number of files. The collapse tolerance is -the first inspector parameter, in three mutually-exclusive forms: a named detail -level, a similarity proportion, or a max-classes budget. +`file_tree` and `file_content_shape` keep Markdown output small with +deterministic caps: small trees get an actual tree; content-shape reports show +the selected file set, dominant structures, and compact text/tabular/tree +facets, with `-v` for expanded evidence. Clustering inspectors such as +`document_shape` still collapse near-identical profiles into named classes, so +output is proportional to the number of *distinct* profiles rather than the +number of files. The collapse tolerance is the first inspector parameter, in +three mutually-exclusive forms: a named detail level, a similarity proportion, +or a max-classes budget. ## Output diff --git a/docs/content/reference/cli.md b/docs/content/reference/cli.md index f3057d5..8c8e026 100644 --- a/docs/content/reference/cli.md +++ b/docs/content/reference/cli.md @@ -42,6 +42,22 @@ This applies to human-facing read output such as `list`, detail `show`, and summ `get` output. It does not apply to machine-oriented contracts such as `check` diagnostics, `fix --check` path lists, or `--json` output. +## Inspect selections + +`katalyst inspect <target> --inspector file_content_shape --select <expression>` +profiles only the selected source files.
The first cut scopes `--select` to +`file_content_shape`; using it with collection targets, with no inspector, with +multiple inspectors, or with any other inspector is a usage error. + +Supported selections: + +| Form | Meaning | +|---|---| +| `content/books/` | files under a directory prefix | +| `content/books/*.md` | files matching a doublestar-style glob | +| `ext = ".csv"` | files with the given extension | +| `path under "docs/reference"` | files under the quoted path prefix | + ## Exit codes Shared across the validating commands (`check`, `fix --check`): diff --git a/docs/content/reference/inspectors/_index.md b/docs/content/reference/inspectors/_index.md index cad8aaa..d18e490 100644 --- a/docs/content/reference/inspectors/_index.md +++ b/docs/content/reference/inspectors/_index.md @@ -15,7 +15,7 @@ Inspectors describe the shape of content and return evidence: counts and distrib Raw-source inspectors profile a backend store directly, before any collection configuration: what files are present, how they parse, and how they are named. - [File tree]({{< relref "source/file-tree.md" >}}): Map files, directories, extensions, regions, and filename conventions, opening no files. -- [File tree (deep)]({{< relref "source/file-tree-content.md" >}}): Parse markdown and profile each directory's content shape: parse rate, frontmatter, key-sets. +- [File content shape]({{< relref "source/file-content-shape.md" >}}): Profile selected files by text, tabular, and tree content structure. - [Document shape]({{< relref "source/document-shape.md" >}}): Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming. 
## Collection inspectors diff --git a/docs/content/reference/inspectors/source/_index.md b/docs/content/reference/inspectors/source/_index.md index 0846a40..51bd3e0 100644 --- a/docs/content/reference/inspectors/source/_index.md +++ b/docs/content/reference/inspectors/source/_index.md @@ -11,5 +11,5 @@ Raw-source inspectors profile a backend store directly, before any collection co Inspectors in this layer: - [File tree]({{< relref "file-tree.md" >}}): Map files, directories, extensions, regions, and filename conventions, opening no files. -- [File tree (deep)]({{< relref "file-tree-content.md" >}}): Parse markdown and profile each directory's content shape: parse rate, frontmatter, key-sets. +- [File content shape]({{< relref "file-content-shape.md" >}}): Profile selected files by text, tabular, and tree content structure. - [Document shape]({{< relref "document-shape.md" >}}): Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming. diff --git a/docs/content/reference/inspectors/source/file-tree-content.md b/docs/content/reference/inspectors/source/file-content-shape.md similarity index 60% rename from docs/content/reference/inspectors/source/file-tree-content.md rename to docs/content/reference/inspectors/source/file-content-shape.md index 7124ca9..9a026b0 100644 --- a/docs/content/reference/inspectors/source/file-tree-content.md +++ b/docs/content/reference/inspectors/source/file-content-shape.md @@ -1,5 +1,5 @@ +++ -title = "File tree (deep)" +title = "File content shape" weight = 20 +++ @@ -7,7 +7,7 @@ weight = 20 ## Inspector ID -`file_tree_content` +`file_content_shape` ## Layer @@ -15,12 +15,12 @@ source ## Purpose -Parse markdown and profile each directory's content shape: parse rate, frontmatter, key-sets. +Profile selected files by text, tabular, and tree content structure. ## Usage Inspectors emit evidence: counts and distributions, for the reader to judge. 
Run this one with: ``` -katalyst inspect --inspector file_tree_content +katalyst inspect --inspector file_content_shape ``` diff --git a/internal/inspect/filecontentshape.go b/internal/inspect/filecontentshape.go new file mode 100644 index 0000000..b710313 --- /dev/null +++ b/internal/inspect/filecontentshape.go @@ -0,0 +1,316 @@ +package inspect + +import ( + "bytes" + "encoding/csv" + "encoding/json" + "fmt" + "sort" + + "github.com/abegong/katalyst/internal/storage" + "github.com/abegong/katalyst/internal/storage/collection/document" +) + +type contentIssue struct { + Path string + Kind string + Detail string +} + +// FileContentShape profiles a selected set of source files by light content +// parsing. It is filesystem-specific through SourceView. +type FileContentShape struct{} + +func (FileContentShape) Name() string { return "file_content_shape" } + +func (FileContentShape) AppliesTo(t storage.StorageType) bool { return t == storage.Filesystem } + +func (FileContentShape) Inspect(v SourceView, p Params) Evidence { + data := buildFileContentShape(v, p.Selection) + return Evidence{Inspector: "file_content_shape", Scope: v.root, N: asInt(data["file_count"]), Data: data} +} + +func buildFileContentShape(v SourceView, sel Selection) map[string]any { + if sel.Mode == "" { + sel = ParseSelection("") + } + files, err := v.selectFiles(sel) + if err != nil { + return map[string]any{ + "selector": sel.Label, + "file_count": 0, + "issues": issuesToAny([]contentIssue{{Kind: "selection", Detail: err.Error()}}), + "coherence": "mixed", + "summary_text": err.Error(), + } + } + + exts := map[string]int{} + dirs := map[string]bool{} + readable, unsupported := 0, 0 + var issues []contentIssue + mdFiles, csvFiles, jsonFiles := 0, 0, 0 + mdKeys, mdSections := map[string]int{}, map[string]int{} + mdH1 := 0 + csvColumns := map[string]int{} + var csvRows []int + jsonTop := map[string]int{} + jsonKeys := map[string]int{} + jsonObjects := 0 + + for _, f := range files { + 
exts[f.ext]++ + dirs[f.dir] = true + src, err := v.readFile(f.rel) + if err != nil { + issues = append(issues, contentIssue{Path: f.rel, Kind: "read_failed", Detail: err.Error()}) + continue + } + readable++ + switch f.ext { + case ".md": + mdFiles++ + doc, err := document.Parse(src) + if err != nil { + issues = append(issues, contentIssue{Path: f.rel, Kind: "parse_failed", Detail: err.Error()}) + continue + } + if doc.HasFrontmatter { + for k := range doc.Meta { + mdKeys[k]++ + } + } + seenSections := map[string]bool{} + hasH1 := false + for _, h := range headings(doc.Body) { + if h.level == 1 { + hasH1 = true + } + if h.level >= 2 { + seenSections[h.text] = true + } + } + if hasH1 { + mdH1++ + } + for section := range seenSections { + mdSections[section]++ + } + case ".csv": + csvFiles++ + r := csv.NewReader(bytes.NewReader(src)) + records, err := r.ReadAll() + if err != nil { + issues = append(issues, contentIssue{Path: f.rel, Kind: "parse_failed", Detail: err.Error()}) + continue + } + if len(records) == 0 { + csvRows = append(csvRows, 0) + continue + } + seen := map[string]bool{} + for _, col := range records[0] { + seen[col] = true + } + for col := range seen { + csvColumns[col]++ + } + csvRows = append(csvRows, len(records)-1) + case ".json": + jsonFiles++ + var val any + dec := json.NewDecoder(bytes.NewReader(src)) + dec.UseNumber() + if err := dec.Decode(&val); err != nil { + issues = append(issues, contentIssue{Path: f.rel, Kind: "parse_failed", Detail: err.Error()}) + continue + } + shape := jsonShape(val) + jsonTop[shape]++ + if obj, ok := val.(map[string]any); ok { + jsonObjects++ + for k := range obj { + jsonKeys[k]++ + } + } + default: + unsupported++ + issues = append(issues, contentIssue{Path: f.rel, Kind: "unsupported", Detail: "no first-cut content parser for " + extensionLabel(f.ext)}) + } + } + + common, variation := contentCommonVariation(len(files), mdFiles, mdKeys, mdSections, mdH1, csvFiles, csvColumns, csvRows, jsonFiles, jsonTop, 
jsonObjects, jsonKeys, unsupported) + coherence := contentCoherence(len(files), mdFiles, csvFiles, jsonFiles, unsupported) + return map[string]any{ + "selector": sel.Label, + "file_count": len(files), + "dir_count": len(dirs), + "extensions": toAnyMap(exts), + "readable_count": readable, + "unsupported_count": unsupported, + "parse_failure_count": countIssueKind(issues, "parse_failed"), + "coherence": coherence, + "common_structure": stringsToAny(common), + "variation": stringsToAny(variation), + "markdown": map[string]any{ + "files": mdFiles, + "h1": mdH1, + "frontmatter_keys": toAnyMap(mdKeys), + "sections": toAnyMap(mdSections), + }, + "csv": map[string]any{ + "files": csvFiles, + "columns": toAnyMap(csvColumns), + "row_counts": rowStats(csvRows), + }, + "json": map[string]any{ + "files": jsonFiles, + "top_level_shapes": toAnyMap(jsonTop), + "object_files": jsonObjects, + "common_object_keys": toAnyMap(jsonKeys), + }, + "issues": issuesToAny(issues), + } +} + +func jsonShape(v any) string { + switch v.(type) { + case map[string]any: + return "object" + case []any: + return "array" + case string: + return "string" + case json.Number, float64: + return "number" + case bool: + return "boolean" + case nil: + return "null" + default: + return "unknown" + } +} + +func contentCommonVariation(total, mdFiles int, mdKeys, mdSections map[string]int, mdH1 int, csvFiles int, csvColumns map[string]int, csvRows []int, jsonFiles int, jsonTop map[string]int, jsonObjects int, jsonKeys map[string]int, unsupported int) ([]string, []string) { + var common, variation []string + if mdFiles > 0 { + if mdH1 == mdFiles { + common = append(common, fmt.Sprintf("%d/%d Markdown files have an H1", mdH1, mdFiles)) + } + for _, k := range highFrequency(mdKeys, mdFiles, 0.8) { + common = append(common, fmt.Sprintf("%d/%d Markdown files have frontmatter key %s", mdKeys[k], mdFiles, k)) + } + for _, s := range highFrequency(mdSections, mdFiles, 0.8) { + common = append(common, fmt.Sprintf("%d/%d 
Markdown files have section %s", mdSections[s], mdFiles, s)) + } + for _, k := range midFrequency(mdKeys, mdFiles) { + variation = append(variation, fmt.Sprintf("frontmatter key %s appears in %d/%d Markdown files", k, mdKeys[k], mdFiles)) + } + } + if csvFiles > 0 { + for _, c := range highFrequency(csvColumns, csvFiles, 0.8) { + common = append(common, fmt.Sprintf("%d/%d CSV files have column %s", csvColumns[c], csvFiles, c)) + } + if stats := rowStats(csvRows); stats["files"].(int) > 0 { + common = append(common, fmt.Sprintf("CSV row count ranges from %d to %d, median %d", stats["min"].(int), stats["max"].(int), stats["median"].(int))) + } + for _, c := range midFrequency(csvColumns, csvFiles) { + variation = append(variation, fmt.Sprintf("column %s appears in %d/%d CSV files", c, csvColumns[c], csvFiles)) + } + } + if jsonFiles > 0 { + for _, shape := range sortedKeys(jsonTop) { + common = append(common, fmt.Sprintf("%d/%d JSON files are top-level %ss", jsonTop[shape], jsonFiles, shape)) + } + for _, k := range highFrequency(jsonKeys, jsonObjects, 0.8) { + common = append(common, fmt.Sprintf("%d/%d JSON object files have key %s", jsonKeys[k], jsonObjects, k)) + } + } + if unsupported > 0 { + variation = append(variation, fmt.Sprintf("%d/%d selected files are unsupported by first-cut parsers", unsupported, total)) + } + return common, variation +} + +func contentCoherence(total, mdFiles, csvFiles, jsonFiles, unsupported int) string { + if total == 0 { + return "mixed" + } + best := mdFiles + if csvFiles > best { + best = csvFiles + } + if jsonFiles > best { + best = jsonFiles + } + if unsupported == 0 && best == total { + return "coherent" + } + if best*100 >= total*60 { + return "partly_coherent" + } + return "mixed" +} + +func highFrequency(hist map[string]int, denom int, threshold float64) []string { + if denom == 0 { + return nil + } + var out []string + for _, k := range sortedKeys(hist) { + if float64(hist[k])/float64(denom) >= threshold { + out = 
append(out, k) + } + } + return out +} + +func midFrequency(hist map[string]int, denom int) []string { + if denom == 0 { + return nil + } + var out []string + for _, k := range sortedKeys(hist) { + if hist[k] > 0 && hist[k] < denom { + out = append(out, k) + } + } + return out +} + +func rowStats(rows []int) map[string]any { + if len(rows) == 0 { + return map[string]any{"files": 0, "min": 0, "median": 0, "max": 0} + } + sorted := append([]int(nil), rows...) + sort.Ints(sorted) + return map[string]any{ + "files": len(sorted), + "min": sorted[0], + "median": sorted[len(sorted)/2], + "max": sorted[len(sorted)-1], + } +} + +func countIssueKind(issues []contentIssue, kind string) int { + n := 0 + for _, issue := range issues { + if issue.Kind == kind { + n++ + } + } + return n +} + +func issuesToAny(issues []contentIssue) []any { + out := make([]any, 0, len(issues)) + for _, issue := range issues { + m := map[string]any{"kind": issue.Kind, "detail": issue.Detail} + if issue.Path != "" { + m["path"] = issue.Path + } + out = append(out, m) + } + return out +} diff --git a/internal/inspect/inspectors_source.go b/internal/inspect/inspectors_source.go index c92fb17..7b650bd 100644 --- a/internal/inspect/inspectors_source.go +++ b/internal/inspect/inspectors_source.go @@ -20,28 +20,6 @@ func (FileTree) Inspect(v SourceView, p Params) Evidence { return Evidence{Inspector: "file_tree", Scope: v.root, N: v.N(), Data: buildFileTreeSummary(v)} } -// FileTreeContent is the deep raw-source inspector: it parses markdown and -// profiles each directory by content shape (frontmatter key union, parse / -// frontmatter presence), summarized into classes. Subsumes the former -// walk_parse. Filesystem-specific. 
-type FileTreeContent struct{} - -func (FileTreeContent) Name() string { return "file_tree_content" } - -func (FileTreeContent) AppliesTo(t storage.StorageType) bool { return t == storage.Filesystem } - -func (FileTreeContent) Inspect(v SourceView, p Params) Evidence { - byDir := map[string][]sourceDoc{} - for _, sd := range v.markdown() { - byDir[sd.dir] = append(byDir[sd.dir], sd) - } - profiles := make([]Profile, 0, len(byDir)) - for _, dir := range sortedKeys(byDir) { - profiles = append(profiles, Profile{Label: dir, Features: contentFeatures(byDir[dir])}) - } - return Evidence{Inspector: "file_tree_content", Scope: v.root, N: v.N(), Data: summarize(profiles, p)} -} - // DocumentShape clusters markdown files into candidate collections on a // composite fingerprint: frontmatter keys, body section skeleton, and file // type/naming, so a class agrees on metadata AND structure AND convention, not @@ -76,36 +54,6 @@ func dirFeatures(refs []string) []string { return feats } -// contentFeatures fingerprints a directory by markdown content shape: the union -// of frontmatter keys plus parse/frontmatter presence markers. -func contentFeatures(docs []sourceDoc) []string { - keys := map[string]bool{} - parsed, withFM := 0, 0 - for _, sd := range docs { - if sd.doc == nil { - continue - } - parsed++ - if sd.doc.HasFrontmatter { - withFM++ - for k := range sd.doc.Meta { - keys[k] = true - } - } - } - var feats []string - for _, k := range sortedKeys(keys) { - feats = append(feats, "fmkey:"+k) - } - if parsed > 0 { - feats = append(feats, "parsed") - } - if withFM > 0 { - feats = append(feats, "frontmatter") - } - return feats -} - // shapeFeatures builds a file's composite fingerprint across three dimensions: // file type/naming, frontmatter keys, and body section skeleton. 
func shapeFeatures(sd sourceDoc) []string { diff --git a/internal/inspect/params.go b/internal/inspect/params.go index 83a71bb..7ff103e 100644 --- a/internal/inspect/params.go +++ b/internal/inspect/params.go @@ -19,6 +19,21 @@ type Params struct { mode collapseMode threshold float64 maxClasses int + Selection Selection +} + +// Selection describes the path-derived file subset an inspector should use. +// Empty mode means "all files". +type Selection struct { + Label string + Mode string + Pattern string +} + +// WithSelection returns a copy of p carrying selection. +func (p Params) WithSelection(selection Selection) Params { + p.Selection = selection + return p } // detailThresholds maps the named --detail levels to similarity thresholds. diff --git a/internal/inspect/registry.go b/internal/inspect/registry.go index a4366aa..cde3344 100644 --- a/internal/inspect/registry.go +++ b/internal/inspect/registry.go @@ -81,12 +81,12 @@ func Descriptors() []Descriptor { Summary: "Map files, directories, extensions, regions, and filename conventions, opening no files.", }, { - Name: "file_tree_content", + Name: "file_content_shape", Layer: "source", Family: "structural", - Slug: "file-tree-content", - Title: "File tree (deep)", - Summary: "Parse markdown and profile each directory's content shape: parse rate, frontmatter, key-sets.", + Slug: "file-content-shape", + Title: "File content shape", + Summary: "Profile selected files by text, tabular, and tree content structure.", }, { Name: "document_shape", @@ -119,7 +119,7 @@ func Descriptors() []Descriptor { func SourceInspectors() []SourceInspector { return []SourceInspector{ FileTree{}, - FileTreeContent{}, + FileContentShape{}, DocumentShape{}, } } diff --git a/internal/inspect/render.go b/internal/inspect/render.go index ec8e8ca..2395481 100644 --- a/internal/inspect/render.go +++ b/internal/inspect/render.go @@ -63,6 +63,13 @@ func RenderMarkdown(evs []Evidence, maxLines int) string { } continue } + if ev.Inspector == 
"file_content_shape" { + for _, ln := range fileContentShapeMarkdownLines(ev.Data, maxLines <= 0) { + b.WriteString(ln) + b.WriteByte('\n') + } + continue + } lines := dataLines(ev.Data) if maxLines > 0 && len(lines) > maxLines { hidden := len(lines) - maxLines @@ -182,6 +189,171 @@ func fileTreeMarkdownLines(data map[string]any, expanded bool) []string { return lines } +func fileContentShapeMarkdownLines(data map[string]any, expanded bool) []string { + selector, _ := data["selector"].(string) + fileCount := asInt(data["file_count"]) + dirCount := asInt(data["dir_count"]) + readable := asInt(data["readable_count"]) + unsupported := asInt(data["unsupported_count"]) + parseFailures := asInt(data["parse_failure_count"]) + coherence, _ := data["coherence"].(string) + lines := []string{sectionDivider, "selection:"} + lines = append(lines, alignRows([][]string{ + {"expression", selector}, + {"files", fmt.Sprintf("%d", fileCount)}, + {"directories", fmt.Sprintf("%d", dirCount)}, + {"readable", fmt.Sprintf("%d", readable)}, + {"unsupported", fmt.Sprintf("%d", unsupported)}, + {"parse failures", fmt.Sprintf("%d", parseFailures)}, + }, " ", ": ")...) + + exts := anyMap(data["extensions"]) + if len(exts) > 0 { + limit := 5 + if expanded { + limit = len(exts) + } + lines = appendSection(lines, "file types:") + lines = append(lines, histogramTableLines(exts, limit, "TYPE", "FILES")...) + if len(exts) > limit { + lines = append(lines, fmt.Sprintf(" ... %d more extensions hidden; pass -v to show all", len(exts)-limit)) + } + } + + lines = appendSection(lines, "coherence:") + lines = append(lines, alignRows([][]string{{"status", coherence}}, " ", ": ")...) 
+ + lines = appendContentBulletSection(lines, "common structure:", stringSlice(data["common_structure"]), expanded) + lines = appendContentBulletSection(lines, "variation:", stringSlice(data["variation"]), expanded) + + lines = appendSection(lines, "text:") + lines = append(lines, markdownShapeLines(anyMap(data["markdown"]), expanded)...) + lines = appendSection(lines, "tabular:") + lines = append(lines, csvShapeLines(anyMap(data["csv"]))...) + lines = appendSection(lines, "tree:") + lines = append(lines, jsonShapeLines(anyMap(data["json"]), expanded)...) + lines = appendSection(lines, "read/parse issues:") + lines = append(lines, issueLines(anySlice(data["issues"]), expanded)...) + return lines +} + +func appendContentBulletSection(lines []string, label string, items []string, expanded bool) []string { + lines = appendSection(lines, label) + if len(items) == 0 { + return append(lines, " none") + } + limit := len(items) + if !expanded && limit > 5 { + limit = 5 + } + for _, item := range items[:limit] { + lines = append(lines, fmt.Sprintf(" - %s", item)) + } + if len(items) > limit { + lines = append(lines, fmt.Sprintf(" ... %d more item(s) hidden; pass -v to show all", len(items)-limit)) + } + return lines +} + +func markdownShapeLines(md map[string]any, expanded bool) []string { + files := asInt(md["files"]) + if files == 0 { + return []string{" no Markdown files selected"} + } + lines := alignRows([][]string{ + {"files", fmt.Sprintf("%d", files)}, + {"with H1", fmt.Sprintf("%d", asInt(md["h1"]))}, + }, " ", ": ") + keys := anyMap(md["frontmatter_keys"]) + if len(keys) > 0 { + lines = append(lines, " frontmatter keys:") + limit := 5 + if expanded { + limit = len(keys) + } + lines = append(lines, histogramTableLines(keys, limit, "KEY", "FILES")...) + if len(keys) > limit { + lines = append(lines, fmt.Sprintf(" ... 
%d more keys hidden; pass -v to show all", len(keys)-limit)) + } + } + sections := anyMap(md["sections"]) + if len(sections) > 0 && expanded { + lines = append(lines, " sections:") + lines = append(lines, histogramTableLines(sections, len(sections), "SECTION", "FILES")...) + } + return lines +} + +func csvShapeLines(csv map[string]any) []string { + files := asInt(csv["files"]) + if files == 0 { + return []string{" no CSV files selected"} + } + stats := anyMap(csv["row_counts"]) + lines := alignRows([][]string{ + {"files", fmt.Sprintf("%d", files)}, + {"rows", fmt.Sprintf("%d-%d (median %d)", asInt(stats["min"]), asInt(stats["max"]), asInt(stats["median"]))}, + }, " ", ": ") + cols := anyMap(csv["columns"]) + if len(cols) > 0 { + lines = append(lines, " columns:") + lines = append(lines, histogramTableLines(cols, len(cols), "COLUMN", "FILES")...) + } + return lines +} + +func jsonShapeLines(js map[string]any, expanded bool) []string { + files := asInt(js["files"]) + if files == 0 { + return []string{" no JSON files selected"} + } + lines := alignRows([][]string{ + {"files", fmt.Sprintf("%d", files)}, + {"object files", fmt.Sprintf("%d", asInt(js["object_files"]))}, + }, " ", ": ") + shapes := anyMap(js["top_level_shapes"]) + if len(shapes) > 0 { + lines = append(lines, " top-level shapes:") + lines = append(lines, histogramTableLines(shapes, len(shapes), "SHAPE", "FILES")...) + } + keys := anyMap(js["common_object_keys"]) + if len(keys) > 0 { + lines = append(lines, " object keys:") + limit := 5 + if expanded { + limit = len(keys) + } + lines = append(lines, histogramTableLines(keys, limit, "KEY", "FILES")...) + if len(keys) > limit { + lines = append(lines, fmt.Sprintf(" ... 
%d more keys hidden; pass -v to show all", len(keys)-limit)) + } + } + return lines +} + +func issueLines(issues []any, expanded bool) []string { + if len(issues) == 0 { + return []string{" none"} + } + limit := len(issues) + if !expanded && limit > 5 { + limit = 5 + } + rows := [][]string{{"KIND", "PATH", "DETAIL"}} + for _, item := range issues[:limit] { + m := anyMap(item) + kind, _ := m["kind"].(string) + path, _ := m["path"].(string) + detail, _ := m["detail"].(string) + rows = append(rows, []string{kind, path, detail}) + } + lines := alignTable(rows, " ") + if len(issues) > limit { + lines = append(lines, fmt.Sprintf(" ... %d more issue(s) hidden; pass -v to show all", len(issues)-limit)) + } + return lines +} + func appendSection(lines []string, label string) []string { return append(lines, "", sectionDivider, label) } diff --git a/internal/inspect/selection.go b/internal/inspect/selection.go new file mode 100644 index 0000000..00d279c --- /dev/null +++ b/internal/inspect/selection.go @@ -0,0 +1,95 @@ +package inspect + +import ( + "fmt" + "sort" + "strings" + + "github.com/bmatcuk/doublestar/v4" +) + +const ( + SelectionAll = "all" + SelectionDir = "dir" + SelectionGlob = "glob" + SelectionExt = "ext" + SelectionPathUnder = "path_under" +) + +// ParseSelection classifies the user-facing --select expression. It is +// path-only; content predicates belong to a later pass. 
+func ParseSelection(raw string) Selection { + label := strings.TrimSpace(raw) + if label == "" { + return Selection{Label: "all files", Mode: SelectionAll} + } + if ext, ok := parseQuotedPredicate(label, "ext = "); ok { + return Selection{Label: label, Mode: SelectionExt, Pattern: ext} + } + if prefix, ok := parseQuotedPredicate(label, "path under "); ok { + return Selection{Label: label, Mode: SelectionPathUnder, Pattern: cleanSelectionPath(prefix)} + } + if strings.ContainsAny(label, "*?[") { + return Selection{Label: label, Mode: SelectionGlob, Pattern: cleanSelectionPath(label)} + } + return Selection{Label: label, Mode: SelectionDir, Pattern: cleanSelectionPath(label)} +} + +func parseQuotedPredicate(s, prefix string) (string, bool) { + if !strings.HasPrefix(s, prefix) { + return "", false + } + rest := strings.TrimSpace(strings.TrimPrefix(s, prefix)) + if len(rest) >= 2 && rest[0] == '"' && rest[len(rest)-1] == '"' { + return rest[1 : len(rest)-1], true + } + return rest, true +} + +func cleanSelectionPath(s string) string { + s = strings.TrimSpace(strings.ReplaceAll(s, "\\", "/")) + s = strings.TrimPrefix(s, "./") + return strings.Trim(s, "/") +} + +func (v SourceView) selectFiles(sel Selection) ([]sourceFile, error) { + if sel.Mode == "" || sel.Mode == SelectionAll { + return sortedSourceFiles(v.files), nil + } + var out []sourceFile + for _, f := range v.files { + ok := false + switch sel.Mode { + case SelectionDir: + prefix := strings.TrimSuffix(sel.Pattern, "/") + ok = f.rel == prefix || strings.HasPrefix(f.rel, prefix+"/") + case SelectionGlob: + matched, err := doublestar.Match(sel.Pattern, f.rel) + if err != nil { + return nil, fmt.Errorf("select %q: %w", sel.Label, err) + } + ok = matched + case SelectionExt: + ok = f.ext == sel.Pattern + case SelectionPathUnder: + prefix := strings.TrimSuffix(sel.Pattern, "/") + ok = f.rel == prefix || strings.HasPrefix(f.rel, prefix+"/") + default: + return nil, fmt.Errorf("select %q: unknown selection mode %q", 
sel.Label, sel.Mode) + } + if ok { + out = append(out, f) + } + } + return sortedSourceFiles(out), nil +} + +func sortedSourceFiles(files []sourceFile) []sourceFile { + out := append([]sourceFile(nil), files...) + sortSourceFiles(out) + return out +} + +func sortSourceFiles(files []sourceFile) { + sort.Slice(files, func(i, j int) bool { return files[i].rel < files[j].rel }) +} diff --git a/internal/inspect/selection_test.go b/internal/inspect/selection_test.go new file mode 100644 index 0000000..645b285 --- /dev/null +++ b/internal/inspect/selection_test.go @@ -0,0 +1,26 @@ +package inspect_test + +import ( + "reflect" + "testing" + + "github.com/abegong/katalyst/internal/inspect" +) + +func TestParseSelection(t *testing.T) { + tests := []struct { + raw string + want inspect.Selection + }{ + {"", inspect.Selection{Label: "all files", Mode: inspect.SelectionAll}}, + {"content/books/*.md", inspect.Selection{Label: "content/books/*.md", Mode: inspect.SelectionGlob, Pattern: "content/books/*.md"}}, + {`ext = ".csv"`, inspect.Selection{Label: `ext = ".csv"`, Mode: inspect.SelectionExt, Pattern: ".csv"}}, + {`path under "docs/reference"`, inspect.Selection{Label: `path under "docs/reference"`, Mode: inspect.SelectionPathUnder, Pattern: "docs/reference"}}, + {"raw/notes", inspect.Selection{Label: "raw/notes", Mode: inspect.SelectionDir, Pattern: "raw/notes"}}, + } + for _, tt := range tests { + if got := inspect.ParseSelection(tt.raw); !reflect.DeepEqual(got, tt.want) { + t.Errorf("ParseSelection(%q) = %#v, want %#v", tt.raw, got, tt.want) + } + } +} diff --git a/internal/inspect/source.go b/internal/inspect/source.go index 817b947..2ba5d9a 100644 --- a/internal/inspect/source.go +++ b/internal/inspect/source.go @@ -93,6 +93,13 @@ func (v SourceView) N() int { return len(v.files) } // nothing. func (v SourceView) ParseCount() int { return v.md.count } +// readFile opens one discovered file by relative path and records the read as +// content inspection work. 
+func (v SourceView) readFile(rel string) ([]byte, error) { + v.md.count++ + return os.ReadFile(filepath.Join(v.root, filepath.FromSlash(rel))) +} + // refsByDir groups every file's relative path by its directory. func (v SourceView) refsByDir() map[string][]string { out := map[string][]string{} diff --git a/internal/inspect/source_test.go b/internal/inspect/source_test.go index 4ffc3df..af94387 100644 --- a/internal/inspect/source_test.go +++ b/internal/inspect/source_test.go @@ -59,17 +59,28 @@ func TestFileTree_opensNothingAndReportsFilesystemMap(t *testing.T) { } } -func TestFileTreeContent_parsesMarkdown(t *testing.T) { +func TestFileContentShape_profilesSelectedMarkdown(t *testing.T) { dir := t.TempDir() writeFile(t, dir, "notes/dune.md", "---\ntitle: Dune\n---\n# Dune\n") + writeFile(t, dir, "data/books.csv", "title,rating\nDune,5\n") view, err := inspect.NewSourceView(dir) if err != nil { t.Fatalf("NewSourceView: %v", err) } - _ = inspect.FileTreeContent{}.Inspect(view, inspect.Params{}) + ev := inspect.FileContentShape{}.Inspect(view, inspect.Params{}.WithSelection(inspect.ParseSelection(`ext = ".md"`))) if view.ParseCount() == 0 { - t.Error("file_tree_content should parse markdown (ParseCount > 0)") + t.Error("file_content_shape should open selected files (ParseCount > 0)") + } + if ev.Inspector != "file_content_shape" { + t.Errorf("inspector = %q, want file_content_shape", ev.Inspector) + } + if got := ev.Data["file_count"].(int); got != 1 { + t.Errorf("file_count = %d, want selected markdown file only", got) + } + md := ev.Data["markdown"].(map[string]any) + if got := md["files"].(int); got != 1 { + t.Errorf("markdown.files = %d, want 1", got) } } diff --git a/product/specs/file-content-shape-inspector-plan.md b/product/specs/file-content-shape-inspector-plan.md new file mode 100644 index 0000000..2d523b2 --- /dev/null +++ b/product/specs/file-content-shape-inspector-plan.md @@ -0,0 +1,206 @@ +# Plan - file content shape inspector + +> Spec: [File 
content shape inspector](./file-content-shape-inspector-spec.md) +> **Status: executed.** + +## Current State + +- `katalyst inspect <target>` infers one layer from the target. A + configured collection runs collection inspectors; a filesystem directory runs + source inspectors. `--inspector` already narrows by registry name. +- `internal/inspect/params.go` carries source and collection inspector + parameters for summarizer detail. Inspectors that do not need a parameter + ignore it. +- `internal/inspect/source.go` walks non-hidden filesystem paths into + `SourceView.files`, and lazily parses all `.md` files through + `SourceView.markdown`. +- `file_tree` now owns the path-only store map. The old content inspector parsed + Markdown per directory and rendered generic `classes` / `outliers`. +- `document_shape` still clusters Markdown files by composite fingerprint. The + new spec keeps clustering as a follow-up path, not the primary content-shape + workflow. +- `internal/inspect/render.go` has a custom Markdown renderer for `file_tree`; + all other inspectors use generic key/value rendering. + +## Sequencing + +| Phase | Focus | Scope | +|---|---|---| +| 1 | CLI and selection contracts | `--select` flag, params plumbing, usage errors, selection resolver tests | +| 2 | Content-shape model | selected file reads, Markdown/CSV/JSON parsers, evidence payload | +| 3 | Registry transition | replace `file_tree_content` with `file_content_shape`, docs-generation snapshots | +| 4 | Markdown rendering | inspector-specific report, default caps, verbose expansion | +| 5 | Docs and verification | CLI/reference/deep-dive updates, full test suite | + +The order keeps the user-facing contract stable first: `inspect` remains one +command, `file_content_shape` remains a normal source inspector, and `--select` +is scoped to that inspector before any parser behavior depends on it.
+ +## Phases + +### Phase 1 - CLI and selection contracts + +**Goal:** `--select` reaches source inspectors through `inspect.Params`, and +invalid combinations fail before files are opened. + +1. **File:** `internal/inspect/params.go`. + Add a `Selection` value on `Params`, with `Label`, `Mode`, and `Pattern`. + Extend `ParseParams` or add a small wrapper so existing summarizer tests stay + focused and selection validation remains in `cmd/inspect.go`. +2. **File:** `cmd/inspect.go`. + Add `--select string`. It is valid only when the target resolves to the + source layer, exactly one `--inspector` is supplied, and that inspector is + `file_content_shape`. Passing `--select` with a collection target, no + inspector, multiple inspectors, or another inspector returns a usage error. +3. **File:** `internal/inspect/selection.go` (new). + Resolve selection after the `SourceView` walk and before content reads. + Support: + - default all files when no selection is set + - directory prefix (`content/books/`) + - doublestar-style glob or `path.Match`-compatible glob + - path query `ext = ".csv"` + - path query `path under "docs/reference"` +4. **File:** `internal/inspect/source.go`. + Add small helpers for selected files and reading a relative file path. Keep + the selection path-derived; no content predicate support. +5. **Tests:** `cmd/inspect_test.go`, `internal/inspect/params_test.go`, and a + new `internal/inspect/selection_test.go`. + Pin valid and invalid `--select` combinations plus deterministic selected + path ordering. + +### Phase 2 - Content-shape model + +**Goal:** selected files produce complete JSON evidence for Markdown, CSV, JSON, +and unsupported/read-failure cases. + +1. **File:** `internal/inspect/filecontentshape.go` (new). + Add `FileContentShape.Inspect`, a typed summary builder, and map conversion + for JSON evidence. +2. **File:** `internal/inspect/filecontentshape.go`. 
+ Compute selection summary: selector label, selected file count, directory + count, extension histogram, readable count, unsupported count, parse-failure + count, and skipped paths. +3. **File:** `internal/inspect/filecontentshape.go`. + Markdown parser: use the existing document parser to produce text and tree + view facets: frontmatter key frequencies, H1 count, H2+ section frequencies, + and parse issues. +4. **File:** `internal/inspect/filecontentshape.go`. + CSV parser: use `encoding/csv` to report common column names, optional + columns, row-count min/median/max, and parse issues. +5. **File:** `internal/inspect/filecontentshape.go`. + JSON parser: use `encoding/json` to report top-level shape frequencies and + common keys for top-level objects. +6. **File:** `internal/inspect/filecontentshape.go`. + Add a small coherence classifier (`coherent`, `partly_coherent`, `mixed`) + based on high-frequency view facts. Keep it descriptive and count-backed. +7. **Tests:** `internal/inspect/filecontentshape_test.go`. + Cover coherent Markdown, coherent CSV, partly coherent JSON, broad mixed + selection, unsupported files, and parse/read failures. + +### Phase 3 - Registry transition + +**Goal:** `file_content_shape` is the public source inspector name; the old +Markdown-only inspector is removed from the default public registry. + +1. **File:** `internal/inspect/inspectors_source.go`. + Replace `FileTreeContent` in `SourceInspectors()` with + `FileContentShape`. Remove old per-directory Markdown clustering helpers if + no production code or tests still need them. +2. **File:** `internal/inspect/registry.go`. + Replace the descriptor `file_tree_content` with `file_content_shape`, + updating slug, title, family, and summary. +3. **Files:** `cmd/testdata/snapshots/inspectors/*`, + `docs/content/reference/inspectors/source/*`. + Regenerate inspector reference docs with `make docs-gen`; update snapshots + for list/show output. +4. 
**Tests:** `internal/inspect/registry_test.go` and `cmd/inspectors_test.go`. + Keep registry parity green and assert `file_content_shape` appears in source + inspector listings. + +### Phase 4 - Markdown rendering + +**Goal:** default Markdown reads as a short content-shape report while JSON stays +complete. + +1. **File:** `internal/inspect/render.go`. + Add a `file_content_shape` renderer branch before generic `dataLines`. +2. **File:** `internal/inspect/render.go`. + Render default Markdown sections: + - selector and selection summary + - coherence statement + - common structure + - variation + - text, tree, and tabular summaries when present + - read/parse issues +3. **File:** `internal/inspect/render.go`. + Reuse the quiet inspect-report styling from `file_tree`: section dividers, + lowercase labels, aligned key/value rows, and tabular headers. +4. **File:** `internal/inspect/render.go`. + Treat `maxLines <= 0` as expanded output. Verbose output includes more + examples, per-extension/per-directory breakdowns, and full frequency tables. +5. **Tests:** `internal/inspect/render_test.go` and + `cmd/testdata/snapshots/inspect/source-report.txt`. + Pin Markdown for Markdown/CSV/JSON examples and ensure generic rendering + still handles other inspectors. + +### Phase 5 - Docs and verification + +**Goal:** documentation describes the two-step raw-source flow and the focused +suite verifies behavior. + +1. **File:** `docs/content/deep-dives/inspectors.md`. + Describe raw-source inspection as store map (`file_tree`) plus selected + content shape (`file_content_shape`). Note clustering/suggestions as future + work. +2. **File:** `internal/inspect/doc.go`. + Align package-level wording if it names Markdown-only or clustering-specific + behavior. +3. **File:** `docs/content/reference/cli.md`. + Document `--select`, its valid pairing with + `--inspector file_content_shape`, and supported selection syntax. +4. **File:** `docs/content/reference/glossary.md`. 
+ Add `content view` only if the implementation keeps that term visible. +5. **Validation:** run `go test ./internal/inspect ./cmd`, `make docs-gen`, and + `go test ./...`. + +## Key Files + +| File | Role | +|---|---| +| `cmd/inspect.go` | adds and validates `--select`; passes selection through params | +| `internal/inspect/params.go` | carries selection alongside summarizer parameters | +| `internal/inspect/selection.go` | path-derived selection resolver | +| `internal/inspect/source.go` | selected file helpers and relative file reads | +| `internal/inspect/filecontentshape.go` | summary builder and Markdown/CSV/JSON parsers | +| `internal/inspect/registry.go` | public inspector descriptor transition | +| `internal/inspect/render.go` | `file_content_shape` Markdown projection | +| `cmd/testdata/snapshots/inspect/` | CLI output contracts | +| `docs/content/deep-dives/inspectors.md` | raw-source model docs | +| `docs/content/reference/cli.md` | `--select` reference | + +## Architecture Decisions + +| Decision | Choice | Rationale | +|---|---|---| +| Command shape | regular source inspector plus `--select` | preserves the existing inspect grammar and registry flow | +| Selection scope | valid only for `file_content_shape` first cut | avoids implicit subset semantics for unrelated inspectors | +| Public name | replace `file_tree_content` with `file_content_shape` | avoids two names and keeps reports canonical | +| Parser scope | Markdown, CSV, JSON only | enough to prove text/tabular/tree views without dependency drift | +| Selection timing | path-only before content reads | deterministic, cheap, and matches the spec boundary | +| Output model | complete JSON, capped Markdown | machines get full evidence; humans get a short report | +| Clustering | deferred | explicit selections are the primary workflow for now | + +## Documentation updates + +- `docs/content/deep-dives/inspectors.md`: store map plus content shape model. 
+- `docs/content/reference/inspectors/`: regenerate after descriptor rename. +- `docs/content/reference/cli.md`: `--select` syntax and constraints. +- `docs/content/reference/glossary.md`: add only surviving user-facing terms. + +## Out of Scope + +- HTML, XML, YAML, TOML, code AST, Markdown table, or JSON array-to-table + parsers. +- Content predicates in selection syntax. +- Automatic selection suggestions or clustering. +- Alias compatibility for `file_tree_content`; add deliberately later if needed. From 597c3a90e8b5e65eabfcde2e923b7af34fa98a0e Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Thu, 25 Jun 2026 06:47:12 -0600 Subject: [PATCH 09/10] Retire document shape inspector --- cmd/inspect.go | 14 +- cmd/inspect_test.go | 42 +++--- cmd/inspectors_test.go | 8 +- cmd/testdata/snapshots/help/inspect.txt | 3 - .../snapshots/inspect/source-report.txt | 8 -- cmd/testdata/snapshots/inspectors/list.txt | 4 +- .../inspectors/show-document_shape.txt | 15 --- .../inspectors/show-file_content_shape.txt | 14 ++ docs/content/deep-dives/inspectors.md | 28 ++-- .../profile-an-existing-wiki-by-hand.md | 18 +-- .../profile-an-existing-wiki-with-an-agent.md | 21 ++- docs/content/reference/glossary.md | 4 +- docs/content/reference/inspectors/_index.md | 1 - .../reference/inspectors/source/_index.md | 1 - .../inspectors/source/document-shape.md | 30 ----- .../examples/inspect-source-shape.full.md | 67 ++++++++-- .../examples/inspect-source-shape.txt | 59 ++++++++- internal/examples/examples.go | 8 +- .../examples/testdata/inspect-source-shape.md | 67 ++++++++-- internal/inspect/inspectors_source.go | 75 +---------- internal/inspect/params.go | 71 +---------- internal/inspect/params_test.go | 49 ------- internal/inspect/registry.go | 9 -- internal/inspect/render_test.go | 24 +++- internal/inspect/source.go | 63 ++------- internal/inspect/source_test.go | 31 +---- internal/inspect/summarize.go | 120 ------------------ internal/inspect/summarize_test.go | 72 ----------- 
.../file-content-shape-inspector-plan.md | 21 ++- .../file-content-shape-inspector-spec.md | 44 +++---- 30 files changed, 305 insertions(+), 686 deletions(-) delete mode 100644 cmd/testdata/snapshots/inspectors/show-document_shape.txt create mode 100644 cmd/testdata/snapshots/inspectors/show-file_content_shape.txt delete mode 100644 docs/content/reference/inspectors/source/document-shape.md delete mode 100644 internal/inspect/params_test.go delete mode 100644 internal/inspect/summarize.go delete mode 100644 internal/inspect/summarize_test.go diff --git a/cmd/inspect.go b/cmd/inspect.go index 2dda334..a8eb68a 100644 --- a/cmd/inspect.go +++ b/cmd/inspect.go @@ -16,9 +16,6 @@ func newInspectCmd() *cobra.Command { inspectors []string maxLines int verbose bool - detail string - similarity float64 - maxClasses int selectExpr string ) @@ -37,10 +34,7 @@ Inspectors describe; they never recommend. inspect writes no schema and mutates nothing. Output is Markdown by default; --json emits the same evidence as JSON.`, Args: exactArgs(1, "inspect "), RunE: func(cmd *cobra.Command, args []string) error { - params, err := inspect.ParseParams(detail, similarity, maxClasses) - if err != nil { - return usageErr(err.Error()) - } + params := inspect.Params{} if selectExpr != "" { if len(inspectors) != 1 || inspectors[0] != "file_content_shape" { return usageErr("--select requires exactly one source inspector: --inspector file_content_shape") @@ -81,12 +75,6 @@ nothing. Output is Markdown by default; --json emits the same evidence as JSON.` "Truncate each inspector's Markdown output to N lines (0 = no limit).") c.Flags().BoolVarP(&verbose, "verbose", "v", false, "Show full output; do not truncate (same as --max-lines 0).") - c.Flags().StringVar(&detail, "detail", "", - "Summarizer detail level: exact, grouped, or coarse (default grouped).") - c.Flags().Float64Var(&similarity, "similarity", -1, - "Summarizer similarity threshold (0–1). 
Mutually exclusive with --detail/--max-classes.") - c.Flags().IntVar(&maxClasses, "max-classes", 0, - "Cap the number of summarized classes. Mutually exclusive with --detail/--similarity.") c.Flags().StringVar(&selectExpr, "select", "", "Select files for file_content_shape: directory, glob, ext = \".csv\", or path under \"docs\".") return c diff --git a/cmd/inspect_test.go b/cmd/inspect_test.go index 1cfa00e..772262e 100644 --- a/cmd/inspect_test.go +++ b/cmd/inspect_test.go @@ -113,7 +113,7 @@ func TestInspect_outputFileMatchesStdout(t *testing.T) { func TestInspect_inspectorFlagNarrows(t *testing.T) { dir := inspectRepo(t) - stdout, _, err := runRoot(t, "inspect", "--json", "--inspector", "document_shape", dir) + stdout, _, err := runRoot(t, "inspect", "--json", "--inspector", "file_tree", dir) if err != nil { t.Fatalf("inspect --inspector: %v", err) } @@ -121,8 +121,8 @@ func TestInspect_inspectorFlagNarrows(t *testing.T) { if err := json.Unmarshal([]byte(stdout), &records); err != nil { t.Fatalf("bad json: %v", err) } - if len(records) != 1 || records[0]["inspector"] != "document_shape" { - t.Errorf("expected only document_shape, got %v", records) + if len(records) != 1 || records[0]["inspector"] != "file_tree" { + t.Errorf("expected only file_tree, got %v", records) } } @@ -154,7 +154,7 @@ func TestInspect_selectRejectsInvalidCombinations(t *testing.T) { tests := [][]string{ {"inspect", "--select", "books", dir}, {"inspect", "--inspector", "file_tree", "--select", "books", dir}, - {"inspect", "--inspector", "file_content_shape", "--inspector", "document_shape", "--select", "books", dir}, + {"inspect", "--inspector", "file_content_shape", "--inspector", "file_tree", "--select", "books", dir}, } for _, args := range tests { _, _, err := runRoot(t, args...) 
@@ -218,35 +218,33 @@ func TestInspect_unknownInspectorIsUsageError(t *testing.T) { } } -func TestInspect_collapseParamsMutuallyExclusive(t *testing.T) { - dir := inspectRepo(t) - _, _, err := runRoot(t, "inspect", "--detail", "coarse", "--max-classes", "2", dir) - var coded interface{ Code() int } - if err == nil || !errors.As(err, &coded) || coded.Code() != 2 { - t.Errorf("expected exit 2 for mutually-exclusive collapse flags, got: %v", err) - } -} - func TestInspect_outputIncludesDescriptions(t *testing.T) { stdout, _, err := runRoot(t, "inspect", inspectRepo(t)) if err != nil { t.Fatalf("inspect: %v", err) } - if !strings.Contains(stdout, "Cluster files into candidate collections") { + if !strings.Contains(stdout, "Profile selected files by text") { t.Errorf("output missing inspector description\n%s", stdout) } } func TestInspect_truncatesLongOutputAndVerboseShowsAll(t *testing.T) { dir := t.TempDir() - // Ten files with disjoint frontmatter keys + sections → ten singleton - // document_shape classes, enough lines to exceed a small --max-lines. + writeFile(t, dir, ".katalyst/storage/local.yaml", `type: filesystem +root: . 
+collections: + notes: + path: notes + checks: + - kind: markdown_requires_h1 +`) for i := 0; i < 10; i++ { - writeFile(t, dir, fmt.Sprintf("docs/f%02d.md", i), - fmt.Sprintf("---\nk%02d: v\n---\n# H\n\n## S%02d\n", i, i)) + writeFile(t, dir, fmt.Sprintf("notes/f%02d.md", i), + fmt.Sprintf("---\nk%02d: v\n---\n# H\n", i)) } + chdir(t, dir) - truncated, _, err := runRoot(t, "inspect", "--inspector", "document_shape", "--max-lines", "5", dir) + truncated, _, err := runRoot(t, "inspect", "--inspector", "object_fields", "--max-lines", "5", "notes") if err != nil { t.Fatalf("inspect --max-lines: %v", err) } @@ -254,15 +252,15 @@ func TestInspect_truncatesLongOutputAndVerboseShowsAll(t *testing.T) { t.Errorf("expected a truncation notice with --max-lines 5\n%s", truncated) } - full, _, err := runRoot(t, "inspect", "--inspector", "document_shape", "-v", dir) + full, _, err := runRoot(t, "inspect", "--inspector", "object_fields", "-v", "notes") if err != nil { t.Fatalf("inspect -v: %v", err) } if strings.Contains(full, "truncated") { t.Errorf("-v should not truncate\n%s", full) } - if got := strings.Count(full, "label=docs/f"); got != 10 { - t.Errorf("-v rendered %d outliers, want 10\n%s", got, full) + if !strings.Contains(full, "k09") { + t.Errorf("-v should render all object field evidence\n%s", full) } } diff --git a/cmd/inspectors_test.go b/cmd/inspectors_test.go index 88a6b10..68885a2 100644 --- a/cmd/inspectors_test.go +++ b/cmd/inspectors_test.go @@ -55,15 +55,15 @@ func TestInspectorsShow_showsDetail(t *testing.T) { snapshot(t, "inspectors/show-object_fields.txt", stdout) } -func TestInspectorsShow_showsLayerContextAndSiblings(t *testing.T) { +func TestInspectorsShow_showsSourceLayerContextAndSiblings(t *testing.T) { chdir(t, t.TempDir()) - stdout, _, err := runRoot(t, "inspectors", "show", "document_shape") + stdout, _, err := runRoot(t, "inspectors", "show", "file_content_shape") if err != nil { - t.Fatalf("inspectors show document_shape: %v", err) + 
t.Fatalf("inspectors show file_content_shape: %v", err) } // The fixture pins the breadcrumb header, the layer intro, and the sibling // list. - snapshot(t, "inspectors/show-document_shape.txt", stdout) + snapshot(t, "inspectors/show-file_content_shape.txt", stdout) } func TestInspectorsShow_unknown_exit2(t *testing.T) { diff --git a/cmd/testdata/snapshots/help/inspect.txt b/cmd/testdata/snapshots/help/inspect.txt index 1ed175d..0b5f7b2 100644 --- a/cmd/testdata/snapshots/help/inspect.txt +++ b/cmd/testdata/snapshots/help/inspect.txt @@ -13,13 +13,10 @@ Usage: katalyst inspect [flags] Flags: - --detail string Summarizer detail level: exact, grouped, or coarse (default grouped). -h, --help help for inspect --inspector stringArray Run only the named inspector(s); repeatable. Default: all in the selected layer. --json Emit evidence as JSON instead of Markdown. - --max-classes int Cap the number of summarized classes. Mutually exclusive with --detail/--similarity. --max-lines int Truncate each inspector's Markdown output to N lines (0 = no limit). (default 20) -o, --output string Write the report to a file instead of stdout. --select string Select files for file_content_shape: directory, glob, ext = ".csv", or path under "docs". - --similarity float Summarizer similarity threshold (0–1). Mutually exclusive with --detail/--max-classes. (default -1) -v, --verbose Show full output; do not truncate (same as --max-lines 0). 
diff --git a/cmd/testdata/snapshots/inspect/source-report.txt b/cmd/testdata/snapshots/inspect/source-report.txt index 1092652..7eeb8a8 100644 --- a/cmd/testdata/snapshots/inspect/source-report.txt +++ b/cmd/testdata/snapshots/inspect/source-report.txt @@ -56,14 +56,6 @@ tree: read/parse issues: none -### document_shape (n=2) - -_Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming._ - -- classes: - - class=P1 features=[ext:.md, casing:kebab, fmkey:status, fmkey:title, sec:Review] members=[books/dune.md, books/it.md] size=2 -- outliers: - ## Filesystem ### file_tree (n=2) diff --git a/cmd/testdata/snapshots/inspectors/list.txt b/cmd/testdata/snapshots/inspectors/list.txt index daf6b3c..e96a3c3 100644 --- a/cmd/testdata/snapshots/inspectors/list.txt +++ b/cmd/testdata/snapshots/inspectors/list.txt @@ -1,11 +1,9 @@ -Raw-source inspectors (3) +Raw-source inspectors (2) ------------------------- - file_tree Map files, directories, extensions, regions, and filename conventions, opening no files. - file_content_shape Profile selected files by text, tabular, and tree content structure. -- document_shape - Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming. Collection inspectors (2) ------------------------- diff --git a/cmd/testdata/snapshots/inspectors/show-document_shape.txt b/cmd/testdata/snapshots/inspectors/show-document_shape.txt deleted file mode 100644 index 70cc184..0000000 --- a/cmd/testdata/snapshots/inspectors/show-document_shape.txt +++ /dev/null @@ -1,15 +0,0 @@ -Raw-source inspectors › Document shape ----------------------------------------- -- inspector: document_shape -- layer: source -- family: structural -- purpose: Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming. 
- -Layer context -------------- -Raw-source inspectors profile a backend store directly, before any collection configuration: what files are present, how they parse, and how they are named. - -Other raw-source inspectors (2) -------------------------------- -- file_tree -- file_content_shape diff --git a/cmd/testdata/snapshots/inspectors/show-file_content_shape.txt b/cmd/testdata/snapshots/inspectors/show-file_content_shape.txt new file mode 100644 index 0000000..50a3a43 --- /dev/null +++ b/cmd/testdata/snapshots/inspectors/show-file_content_shape.txt @@ -0,0 +1,14 @@ +Raw-source inspectors › File content shape +------------------------------------------- +- inspector: file_content_shape +- layer: source +- family: structural +- purpose: Profile selected files by text, tabular, and tree content structure. + +Layer context +------------- +Raw-source inspectors profile a backend store directly, before any collection configuration: what files are present, how they parse, and how they are named. + +Other raw-source inspectors (1) +------------------------------- +- file_tree diff --git a/docs/content/deep-dives/inspectors.md b/docs/content/deep-dives/inspectors.md index d1f8d4b..58bbca4 100644 --- a/docs/content/deep-dives/inspectors.md +++ b/docs/content/deep-dives/inspectors.md @@ -21,8 +21,8 @@ Inspectors come in two layers, distinguished by *how they reference the data*: - **The raw-source layer** (`SourceInspector` over a `SourceView`) measures a backend store directly, before any collection configuration, addressed by backend-native reference (a relative path today). It answers "what is in this - store?" - the onboarding case. `file_tree`, `file_content_shape`, and - `document_shape` live here. + store?" - the onboarding case. `file_tree` and `file_content_shape` live + here. 
- **The collection layer** (`CollectionInspector` over a `CollectionView`) measures a configured collection's items, addressed by domain identity (collection + item id) and reached through the project's @@ -48,9 +48,9 @@ inspectors themselves are thin wrappers that point a primitive at an input: shape (types, naming, depth, regions, directory density) over references, opening no files. -The same `objectFields` primitive runs over a collection's items (collection -layer) and over loose-file frontmatter (the `document_shape` fingerprint, raw -layer), so the two layers share one engine rather than re-deriving it. +The same small primitives are reused where the layer makes sense, but raw-source +inspectors avoid proposing collections. They report store and content facts; a +human or agent decides what collection boundaries those facts imply. ## Evidence, not recommendations @@ -69,25 +69,17 @@ why a conclusion holds and decides. Deterministic measurement is an inspector's job; threshold-picking and structure-proposing are not. Counting field presence, histogramming types, -mapping filesystem regions, and clustering files by a composite fingerprint are -all deterministic, all inspectors. Deciding that 94% is "required", that two -near-but-distinct clusters are one collection, or what to name a schema are all -judgment, none of it here. -`document_shape` sits on the seam: it groups files with matching fingerprints -(deterministic) but leaves the fuzzy "these two classes are the same collection" -call to the reader. +mapping filesystem regions, and summarizing selected-file content structure are +all deterministic, all inspectors. Deciding that 94% is "required", that a +directory should be a collection, or what to name a schema are all judgment, +none of it here. 
## Keeping output small `file_tree` and `file_content_shape` keep Markdown output small with deterministic caps: small trees get an actual tree; content-shape reports show the selected file set, dominant structures, and compact text/tabular/tree -facets, with `-v` for expanded evidence. Clustering inspectors such as -`document_shape` still collapse near-identical profiles into named classes, so -output is proportional to the number of *distinct* profiles rather than the -number of files. The collapse tolerance is the first inspector parameter, in -three mutually-exclusive forms: a named detail level, a similarity proportion, -or a max-classes budget. +facets, with `-v` for expanded evidence. ## Output diff --git a/docs/content/how-to/profile-an-existing-wiki-by-hand.md b/docs/content/how-to/profile-an-existing-wiki-by-hand.md index e3af96a..033a7b8 100644 --- a/docs/content/how-to/profile-an-existing-wiki-by-hand.md +++ b/docs/content/how-to/profile-an-existing-wiki-by-hand.md @@ -27,15 +27,16 @@ raw-source inspectors: katalyst inspect ./wiki ``` -`document_shape` clusters files into **candidate collections** by a composite -fingerprint (frontmatter keys, body section skeleton, and file naming) so you -can see what natural groups exist: +`file_tree` reports the file types and naming conventions per directory. Use it +to decide which directory or prefix you want to inspect more closely. Then run +`file_content_shape` over that explicit slice: {{< katalyst-example "inspect-source-shape" >}} -`file_tree` reports the file types and naming conventions per directory. Use -this layer to decide **which directories are collections**: here the files -share one shape, so `./wiki` is a single `books` collection with one outlier. +This layer reports store and content facts, not candidate collections. 
Here the +Markdown files share enough structure that you can reasonably treat `./wiki` as +a single `books` collection and keep the file with the missing `author` in mind +as cleanup work. ## 2. Configure the collection @@ -83,10 +84,11 @@ judgment, not the tool's: | `markdown_body` heading shape | single-H1, H1-matches-title | `markdown_single_h1`, `markdown_title_matches_h1` | | `markdown_body` sections | recurring section headings | a `markdown_required_section` | | `file_tree` naming (step 1) | casing, spaces, extensions | `filesystem_name_case` (`style: kebab`), `filesystem_path_charset` (`deny: [" "]`) | +| `file_content_shape` common structure (step 1) | shared frontmatter keys and sections in the selected slice | confidence that the slice is coherent enough to configure as one collection | The denominator `n` is always reported, so you decide what "nearly every item" -means. The one item missing `author`, which is also the `document_shape` -outlier with spaces in its name, is exactly the kind of file a schema will flag. +means. The one item missing `author`, which also has spaces in its name, is +exactly the kind of file a schema will flag. ## 5. Draft a schema and check diff --git a/docs/content/how-to/profile-an-existing-wiki-with-an-agent.md b/docs/content/how-to/profile-an-existing-wiki-with-an-agent.md index 67d5b8d..65be479 100644 --- a/docs/content/how-to/profile-an-existing-wiki-with-an-agent.md +++ b/docs/content/how-to/profile-an-existing-wiki-with-an-agent.md @@ -8,12 +8,12 @@ weight = 6 The [by-hand guide]({{< relref "profile-an-existing-wiki-by-hand.md" >}}) has you read inspector evidence and decide the schema. This guide hands that judgment to an agent: `inspect` supplies the measurements, the agent supplies -the thresholds, the clustering, and the draft. Katalyst is the instrument; the -agent is the profiler. +the thresholds, collection-boundary decisions, and the draft. Katalyst is the +instrument; the agent is the profiler. 
The split is deliberate. Inspectors are deterministic and never recommend; -deciding that a field present in 94% of files should be `required`, or that two -similar directories are one collection, is the agent's call. Keep that division +deciding that a field present in 94% of files should be `required`, or that a +directory should be a collection, is the agent's call. Keep that division and the loop stays debuggable. ## 1. Give the agent the raw-store evidence @@ -26,9 +26,8 @@ denominator: katalyst inspect ./wiki --json ``` -With no project this runs the **raw-source** layer. The key record is -`document_shape`, which clusters files into candidate collections by a composite -fingerprint (frontmatter keys, body section skeleton, file naming). Feed the +With no project this runs the **raw-source** layer: `file_tree` maps the store +and `file_content_shape` summarizes selected-file content structure. Feed the output to the agent. Tell it the contract: every record is *evidence*, not a recommendation; it must choose its own thresholds and justify them. @@ -36,10 +35,10 @@ recommendation; it must choose its own thresholds and justify them. A capable agent then: -1. **Clusters** the `document_shape` classes into candidate collections. - `inspect` groups files with *matching* fingerprints; the agent decides when - two near-but-distinct classes are really one collection, and names them. It - drafts `.katalyst/storage/*` pointing each collection at its directory. +1. **Chooses collection boundaries** from the raw-source evidence. `file_tree` + shows the directory and naming map; `file_content_shape` shows whether an + explicit slice shares frontmatter and body conventions. The agent names the + collection and drafts `.katalyst/storage/*` pointing it at the chosen path. 2. **Profiles the fields** by inspecting each new collection, `katalyst inspect --json` runs the collection layer, whose `object_fields` record is the per-field data dictionary (presence, types, values). 
diff --git a/docs/content/reference/glossary.md b/docs/content/reference/glossary.md index 729e7eb..92f0b9a 100644 --- a/docs/content/reference/glossary.md +++ b/docs/content/reference/glossary.md @@ -29,7 +29,6 @@ how each term maps onto today's code is documented in the per-package | **Document** | The markdown file-form of an **Item**: a parsed markdown file (frontmatter metadata + body + a line map). Use it where parsing or the on-disk file is the subject; elsewhere prefer **Item**. | | **Evidence** | The structured result of one inspector: counts and distributions with the unit count `n` as denominator. Never a recommendation or verdict. | | **Field** | A key in an item's structured object (its frontmatter map). A field is an **Attribute**; a filename is an attribute but not a field. The term used wherever object or frontmatter keys are meant (`object_field_type`, `name_matches_field`). | -| **Fingerprint** | A file's composite signature (frontmatter keys, body section skeleton, and file type/naming) that `document_shape` clusters into candidate collections. | | **Frontmatter** | The on-disk metadata block at the top of a markdown file, in YAML (`---`), TOML (`+++`), or JSON (`{ … }`). | | **Granularity** | The level, item vs. collection, at which a StorageType attaches a store's units to the domain model (a markdown file is an item; a SQL table is a collection). | | **Inspector** | A read-only operation that measures content and returns evidence. The descriptive dual of a check: a check asserts a predicate, an inspector reports the distribution. Inspectors come in two layers. | @@ -37,8 +36,7 @@ how each term maps onto today's code is documented in the per-package | **Measurement primitive** | A reusable building block the inspectors are built from: `object_fields` (a data dictionary over object maps), `markdown_body` (body structure), and file-metadata. | | **Metadata** | The parsed, in-memory structure of the frontmatter (a `map[string]any`). 
| | **Operation** | Something a storage backend lets you do with its data: read, list, query, aggregate, write. Each has a scope (item, collection, across collections) and structural requirements the backend must satisfy. See [progressive operations]({{< relref "../deep-dives/progressive-operations.md" >}}). | -| **Profile class** | A group of near-identical profiles the summarizer collapses together, so output is proportional to the number of distinct profiles, not directories. | -| **Project** | The whole katalyst workspace: a repo root with a `.katalyst/` **Config** that declares the storage instances, collections, and checks katalyst operates over. The top-level scope an empty selector addresses, and what `katalyst init` creates. Collections live within a project; the `project` package (`internal/project`) is its code home, holding the `.katalyst/` loader while the `collection` layer lives under `storage/`. | +| **Project** | The whole katalyst workspace: a repo root with a `.katalyst/` **Config** that declares the storage instances, collections, and checks katalyst operates over. The top-level scope an empty selector addresses, and what `katalyst init` creates. Collections (and the query operations scoped to them) live within a project; the `project` package (`internal/project`) is its code home — it holds the `.katalyst/` loader, and the `collection` layer lives under `storage/`. | | **Raw-source layer** | Inspectors that profile a backend store directly, before any collection configuration, addressed by backend-native reference (a path today). The onboarding case: "what's in this store?" | | **Repo root** | The directory containing the `.katalyst/` config directory; the base for all path resolution. | | **Resolver** | The runtime object that decides which object schema applies to an item and caches compiled schemas per `(library, path)`. 
| diff --git a/docs/content/reference/inspectors/_index.md b/docs/content/reference/inspectors/_index.md index d18e490..ba4b339 100644 --- a/docs/content/reference/inspectors/_index.md +++ b/docs/content/reference/inspectors/_index.md @@ -16,7 +16,6 @@ Raw-source inspectors profile a backend store directly, before any collection co - [File tree]({{< relref "source/file-tree.md" >}}): Map files, directories, extensions, regions, and filename conventions, opening no files. - [File content shape]({{< relref "source/file-content-shape.md" >}}): Profile selected files by text, tabular, and tree content structure. -- [Document shape]({{< relref "source/document-shape.md" >}}): Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming. ## Collection inspectors diff --git a/docs/content/reference/inspectors/source/_index.md b/docs/content/reference/inspectors/source/_index.md index 51bd3e0..5c9e706 100644 --- a/docs/content/reference/inspectors/source/_index.md +++ b/docs/content/reference/inspectors/source/_index.md @@ -12,4 +12,3 @@ Inspectors in this layer: - [File tree]({{< relref "file-tree.md" >}}): Map files, directories, extensions, regions, and filename conventions, opening no files. - [File content shape]({{< relref "file-content-shape.md" >}}): Profile selected files by text, tabular, and tree content structure. -- [Document shape]({{< relref "document-shape.md" >}}): Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming. 
diff --git a/docs/content/reference/inspectors/source/document-shape.md b/docs/content/reference/inspectors/source/document-shape.md deleted file mode 100644 index 93742e7..0000000 --- a/docs/content/reference/inspectors/source/document-shape.md +++ /dev/null @@ -1,30 +0,0 @@ -+++ -title = "Document shape" -weight = 30 -+++ - - - -## Inspector ID - -`document_shape` - -## Layer - -source - -## Purpose - -Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming. - -## Usage - -Inspectors emit evidence: counts and distributions, for the reader to judge. Run this one with: - -``` -katalyst inspect --inspector document_shape -``` - -## Worked example - -{{< katalyst-example-full "inspect-source-shape" >}} diff --git a/docs/generated/examples/inspect-source-shape.full.md b/docs/generated/examples/inspect-source-shape.full.md index 70604ac..5d22bde 100644 --- a/docs/generated/examples/inspect-source-shape.full.md +++ b/docs/generated/examples/inspect-source-shape.full.md @@ -1,4 +1,4 @@ -Pointed at a bare directory (no project), `inspect` runs the raw-source inspectors. `document_shape` clusters files by a composite fingerprint, so a shared convention shows up as one class and the stragglers as outliers. +Pointed at a bare directory (no project), `inspect` runs the raw-source inspectors. `file_content_shape` opens a selected slice and reports the common text, tabular, or tree structure without proposing collections. 
### Input @@ -68,18 +68,65 @@ status: read ### Command ```console -$ katalyst inspect ./wiki --inspector document_shape +$ katalyst inspect ./wiki --inspector file_content_shape --select "ext = \".md\"" # Inspection report: ./wiki ## Structural -### document_shape (n=5) - -_Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming._ - -- classes: - - class=P1 features=[ext:.md, casing:kebab, fmkey:author, fmkey:status, fmkey:title, sec:Review] members=[dune.md, foundation.md, neuromancer.md, snow-crash.md] size=4 -- outliers: - - features=[ext:.md, casing:other, fmkey:status, fmkey:title] label=Dune Messiah.md +### file_content_shape (n=5) + +_Profile selected files by text, tabular, and tree content structure._ + +---------------------------------------- +selection: + expression : ext = ".md" + files : 5 + directories : 1 + readable : 5 + unsupported : 0 + parse failures: 0 + +---------------------------------------- +file types: + TYPE FILES + .md 5 + +---------------------------------------- +coherence: + status: coherent + +---------------------------------------- +common structure: + - 5/5 Markdown files have an H1 + - 4/5 Markdown files have frontmatter key author + - 5/5 Markdown files have frontmatter key status + - 5/5 Markdown files have frontmatter key title + - 4/5 Markdown files have section Review + +---------------------------------------- +variation: + - frontmatter key author appears in 4/5 Markdown files + +---------------------------------------- +text: + files : 5 + with H1: 5 + frontmatter keys: + KEY FILES + status 5 + title 5 + author 4 + +---------------------------------------- +tabular: + no CSV files selected + +---------------------------------------- +tree: + no JSON files selected + +---------------------------------------- +read/parse issues: + none ``` diff --git a/docs/generated/examples/inspect-source-shape.txt b/docs/generated/examples/inspect-source-shape.txt index 
a5d70a3..2c4f3d1 100644 --- a/docs/generated/examples/inspect-source-shape.txt +++ b/docs/generated/examples/inspect-source-shape.txt @@ -2,11 +2,58 @@ ## Structural -### document_shape (n=5) +### file_content_shape (n=5) -_Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming._ +_Profile selected files by text, tabular, and tree content structure._ -- classes: - - class=P1 features=[ext:.md, casing:kebab, fmkey:author, fmkey:status, fmkey:title, sec:Review] members=[dune.md, foundation.md, neuromancer.md, snow-crash.md] size=4 -- outliers: - - features=[ext:.md, casing:other, fmkey:status, fmkey:title] label=Dune Messiah.md +---------------------------------------- +selection: + expression : ext = ".md" + files : 5 + directories : 1 + readable : 5 + unsupported : 0 + parse failures: 0 + +---------------------------------------- +file types: + TYPE FILES + .md 5 + +---------------------------------------- +coherence: + status: coherent + +---------------------------------------- +common structure: + - 5/5 Markdown files have an H1 + - 4/5 Markdown files have frontmatter key author + - 5/5 Markdown files have frontmatter key status + - 5/5 Markdown files have frontmatter key title + - 4/5 Markdown files have section Review + +---------------------------------------- +variation: + - frontmatter key author appears in 4/5 Markdown files + +---------------------------------------- +text: + files : 5 + with H1: 5 + frontmatter keys: + KEY FILES + status 5 + title 5 + author 4 + +---------------------------------------- +tabular: + no CSV files selected + +---------------------------------------- +tree: + no JSON files selected + +---------------------------------------- +read/parse issues: + none diff --git a/internal/examples/examples.go b/internal/examples/examples.go index 5c34bad..9f9752a 100644 --- a/internal/examples/examples.go +++ b/internal/examples/examples.go @@ -234,12 +234,12 @@ func All() 
[]Example { }, { ID: "inspect-source-shape", - Title: "Cluster a raw directory by shape", - Summary: "The raw-source document_shape inspector groups files into candidate collections.", - Doc: "Pointed at a bare directory (no project), `inspect` runs the raw-source inspectors. `document_shape` clusters files by a composite fingerprint, so a shared convention shows up as one class and the stragglers as outliers.", + Title: "Profile selected raw files by content shape", + Summary: "The raw-source file_content_shape inspector profiles a selected slice of files.", + Doc: "Pointed at a bare directory (no project), `inspect` runs the raw-source inspectors. `file_content_shape` opens a selected slice and reports the common text, tabular, or tree structure without proposing collections.", Weight: 60, Files: wikiCorpus, - Args: []string{"inspect", "./wiki", "--inspector", "document_shape"}, + Args: []string{"inspect", "./wiki", "--inspector", "file_content_shape", "--select", `ext = ".md"`}, }, { ID: "inspect-collection-fields", diff --git a/internal/examples/testdata/inspect-source-shape.md b/internal/examples/testdata/inspect-source-shape.md index 1d3ed28..2e12e50 100644 --- a/internal/examples/testdata/inspect-source-shape.md +++ b/internal/examples/testdata/inspect-source-shape.md @@ -1,4 +1,4 @@ -Pointed at a bare directory (no project), `inspect` runs the raw-source inspectors. `document_shape` clusters files by a composite fingerprint, so a shared convention shows up as one class and the stragglers as outliers. +Pointed at a bare directory (no project), `inspect` runs the raw-source inspectors. `file_content_shape` opens a selected slice and reports the common text, tabular, or tree structure without proposing collections. 
## Input @@ -68,18 +68,65 @@ status: read ## Command ```console -$ katalyst inspect ./wiki --inspector document_shape +$ katalyst inspect ./wiki --inspector file_content_shape --select "ext = \".md\"" # Inspection report: ./wiki ## Structural -### document_shape (n=5) - -_Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming._ - -- classes: - - class=P1 features=[ext:.md, casing:kebab, fmkey:author, fmkey:status, fmkey:title, sec:Review] members=[dune.md, foundation.md, neuromancer.md, snow-crash.md] size=4 -- outliers: - - features=[ext:.md, casing:other, fmkey:status, fmkey:title] label=Dune Messiah.md +### file_content_shape (n=5) + +_Profile selected files by text, tabular, and tree content structure._ + +---------------------------------------- +selection: + expression : ext = ".md" + files : 5 + directories : 1 + readable : 5 + unsupported : 0 + parse failures: 0 + +---------------------------------------- +file types: + TYPE FILES + .md 5 + +---------------------------------------- +coherence: + status: coherent + +---------------------------------------- +common structure: + - 5/5 Markdown files have an H1 + - 4/5 Markdown files have frontmatter key author + - 5/5 Markdown files have frontmatter key status + - 5/5 Markdown files have frontmatter key title + - 4/5 Markdown files have section Review + +---------------------------------------- +variation: + - frontmatter key author appears in 4/5 Markdown files + +---------------------------------------- +text: + files : 5 + with H1: 5 + frontmatter keys: + KEY FILES + status 5 + title 5 + author 4 + +---------------------------------------- +tabular: + no CSV files selected + +---------------------------------------- +tree: + no JSON files selected + +---------------------------------------- +read/parse issues: + none ``` diff --git a/internal/inspect/inspectors_source.go b/internal/inspect/inspectors_source.go index 7b650bd..f41a082 100644 --- 
a/internal/inspect/inspectors_source.go +++ b/internal/inspect/inspectors_source.go @@ -1,11 +1,6 @@ package inspect -import ( - "path" - "strings" - - "github.com/abegong/katalyst/internal/storage" -) +import "github.com/abegong/katalyst/internal/storage" // FileTree is the shallow, cheap raw-source inspector: a deterministic // filesystem map from path metadata. It opens no files. Filesystem-specific. @@ -20,61 +15,6 @@ func (FileTree) Inspect(v SourceView, p Params) Evidence { return Evidence{Inspector: "file_tree", Scope: v.root, N: v.N(), Data: buildFileTreeSummary(v)} } -// DocumentShape clusters markdown files into candidate collections on a -// composite fingerprint: frontmatter keys, body section skeleton, and file -// type/naming, so a class agrees on metadata AND structure AND convention, not -// frontmatter alone. The renamed, widened frontmatter_shape. Filesystem-specific. -type DocumentShape struct{} - -func (DocumentShape) Name() string { return "document_shape" } - -func (DocumentShape) AppliesTo(t storage.StorageType) bool { return t == storage.Filesystem } - -func (DocumentShape) Inspect(v SourceView, p Params) Evidence { - docs := v.markdown() - profiles := make([]Profile, 0, len(docs)) - for _, sd := range docs { - profiles = append(profiles, Profile{Label: sd.rel, Features: shapeFeatures(sd)}) - } - return Evidence{Inspector: "document_shape", Scope: v.root, N: len(docs), Data: summarize(profiles, p)} -} - -// dirFeatures turns one directory's file list into summarizer feature tokens: -// the extensions present, the dominant filename casing, and a spaces marker. 
-func dirFeatures(refs []string) []string { - meta := fileMetadata(refs) - var feats []string - for _, e := range sortedKeys(meta["extensions"].(map[string]any)) { - feats = append(feats, "ext:"+e) - } - feats = append(feats, "casing:"+dominant(meta["casing"].(map[string]any))) - if meta["with_spaces"].(int) > 0 { - feats = append(feats, "spaces") - } - return feats -} - -// shapeFeatures builds a file's composite fingerprint across three dimensions: -// file type/naming, frontmatter keys, and body section skeleton. -func shapeFeatures(sd sourceDoc) []string { - feats := []string{ - "ext:" + strings.ToLower(path.Ext(sd.rel)), - "casing:" + nameCasing(sd.rel), - } - if sd.doc == nil { - return feats - } - for _, k := range sortedKeys(sd.doc.Meta) { - feats = append(feats, "fmkey:"+k) - } - for _, h := range headings(sd.doc.Body) { - if h.level >= 2 { - feats = append(feats, "sec:"+h.text) - } - } - return feats -} - // dominant returns the highest-count key in a histogram, ties broken by key for // determinism. func dominant(hist map[string]any) string { @@ -86,16 +26,3 @@ func dominant(hist map[string]any) string { } return best } - -// nameCasing classifies a file's stem as kebab, snake, or other. -func nameCasing(rel string) string { - stem := strings.TrimSuffix(path.Base(rel), path.Ext(rel)) - switch { - case kebabPattern.MatchString(stem): - return "kebab" - case snakePattern.MatchString(stem): - return "snake" - default: - return "other" - } -} diff --git a/internal/inspect/params.go b/internal/inspect/params.go index 7ff103e..e6720b5 100644 --- a/internal/inspect/params.go +++ b/internal/inspect/params.go @@ -1,25 +1,9 @@ package inspect -import "fmt" - -// collapseMode selects how the summarizer decides class membership. -type collapseMode int - -const ( - // thresholdMode merges profiles whose similarity meets a fixed threshold. - thresholdMode collapseMode = iota - // budgetMode lowers the threshold until the class count fits maxClasses. 
- budgetMode -) - -// Params carries inspector parameters. Today it holds only the summarizer's -// collapse tolerance, the first inspector parameter. Inspectors that don't -// summarize ignore it. +// Params carries inspector parameters. Inspectors that do not need a parameter +// ignore it. type Params struct { - mode collapseMode - threshold float64 - maxClasses int - Selection Selection + Selection Selection } // Selection describes the path-derived file subset an inspector should use. @@ -35,52 +19,3 @@ func (p Params) WithSelection(selection Selection) Params { p.Selection = selection return p } - -// detailThresholds maps the named --detail levels to similarity thresholds. -// exact keeps only identical profiles together; coarse merges aggressively. -var detailThresholds = map[string]float64{ - "exact": 1.0, - "grouped": 0.6, - "coarse": 0.3, -} - -// ParseParams resolves the three collapse-tolerance forms into Params. The -// forms are mutually exclusive: a caller passes at most one of a named detail -// level, a 0–1 similarity proportion, or a max-classes budget. With none set, -// the default is the `grouped` named level. Unset sentinels: detail "", -// similarity < 0, maxClasses <= 0. 
-func ParseParams(detail string, similarity float64, maxClasses int) (Params, error) { - set := 0 - if detail != "" { - set++ - } - if similarity >= 0 { - set++ - } - if maxClasses > 0 { - set++ - } - if set > 1 { - return Params{}, fmt.Errorf("--detail, --similarity, and --max-classes are mutually exclusive") - } - - switch { - case maxClasses > 0: - return Params{mode: budgetMode, maxClasses: maxClasses}, nil - case similarity >= 0: - if similarity > 1 { - return Params{}, fmt.Errorf("--similarity: must be between 0 and 1 (got %v)", similarity) - } - return Params{mode: thresholdMode, threshold: similarity}, nil - default: - level := detail - if level == "" { - level = "grouped" - } - thr, ok := detailThresholds[level] - if !ok { - return Params{}, fmt.Errorf("--detail: must be exact, grouped, or coarse (got %q)", level) - } - return Params{mode: thresholdMode, threshold: thr}, nil - } -} diff --git a/internal/inspect/params_test.go b/internal/inspect/params_test.go deleted file mode 100644 index e2b3962..0000000 --- a/internal/inspect/params_test.go +++ /dev/null @@ -1,49 +0,0 @@ -package inspect - -import "testing" - -func TestParseParams_defaultGrouped(t *testing.T) { - p, err := ParseParams("", -1, 0) - if err != nil { - t.Fatalf("default: %v", err) - } - if p.mode != thresholdMode { - t.Fatalf("mode = %v, want thresholdMode", p.mode) - } - if p.threshold != detailThresholds["grouped"] { - t.Errorf("threshold = %v, want grouped %v", p.threshold, detailThresholds["grouped"]) - } -} - -func TestParseParams_mutuallyExclusive(t *testing.T) { - if _, err := ParseParams("coarse", 0.5, 0); err == nil { - t.Error("expected error for detail + similarity") - } - if _, err := ParseParams("", 0.5, 3); err == nil { - t.Error("expected error for similarity + max-classes") - } - if _, err := ParseParams("exact", -1, 4); err == nil { - t.Error("expected error for detail + max-classes") - } -} - -func TestParseParams_eachForm(t *testing.T) { - if p, err := 
ParseParams("exact", -1, 0); err != nil || p.threshold != 1.0 { - t.Errorf("exact: p=%+v err=%v", p, err) - } - if p, err := ParseParams("", 0.25, 0); err != nil || p.mode != thresholdMode || p.threshold != 0.25 { - t.Errorf("similarity: p=%+v err=%v", p, err) - } - if p, err := ParseParams("", -1, 5); err != nil || p.mode != budgetMode || p.maxClasses != 5 { - t.Errorf("budget: p=%+v err=%v", p, err) - } -} - -func TestParseParams_invalid(t *testing.T) { - if _, err := ParseParams("nope", -1, 0); err == nil { - t.Error("expected error for unknown detail level") - } - if _, err := ParseParams("", 2.0, 0); err == nil { - t.Error("expected error for similarity > 1") - } -} diff --git a/internal/inspect/registry.go b/internal/inspect/registry.go index cde3344..1ce50b5 100644 --- a/internal/inspect/registry.go +++ b/internal/inspect/registry.go @@ -88,14 +88,6 @@ func Descriptors() []Descriptor { Title: "File content shape", Summary: "Profile selected files by text, tabular, and tree content structure.", }, - { - Name: "document_shape", - Layer: "source", - Family: "structural", - Slug: "document-shape", - Title: "Document shape", - Summary: "Cluster files into candidate collections by a composite fingerprint of frontmatter, body structure, and file naming.", - }, { Name: "object_fields", Layer: "collection", @@ -120,7 +112,6 @@ func SourceInspectors() []SourceInspector { return []SourceInspector{ FileTree{}, FileContentShape{}, - DocumentShape{}, } } diff --git a/internal/inspect/render_test.go b/internal/inspect/render_test.go index 5ba0753..d348870 100644 --- a/internal/inspect/render_test.go +++ b/internal/inspect/render_test.go @@ -12,7 +12,17 @@ import ( // so the renderer is exercised without depending on any inspector's internals. 
func renderInput() []inspect.Evidence { return []inspect.Evidence{ - {Inspector: "document_shape", Scope: "books", N: 3, Data: map[string]any{"classes": []any{}, "outliers": []any{}}}, + {Inspector: "file_tree", Scope: "books", N: 3, Data: map[string]any{ + "file_count": 3, + "dir_count": 1, + "max_depth": 1, + "extensions": map[string]any{".md": 3}, + "tree_entries": []any{}, + "top_level_regions": []any{}, + "directory_summaries": []any{}, + "representative_paths": []any{}, + "naming": map[string]any{}, + }}, {Inspector: "object_fields", Scope: "books", N: 3, Data: map[string]any{"title": map[string]any{"present": 3}}}, } } @@ -20,8 +30,8 @@ func renderInput() []inspect.Evidence { func TestRenderMarkdown_groupsByFamilyWithCounts(t *testing.T) { md := inspect.RenderMarkdown(renderInput(), 0) for _, want := range []string{ - "## Structural", - "### document_shape (n=3)", + "## Filesystem", + "### file_tree (n=3)", "## Object", "### object_fields (n=3)", "- present: 3", @@ -149,11 +159,11 @@ func TestRenderJSON_roundTrips(t *testing.T) { t.Errorf("record missing %q: %v", key, first) } } - if first["inspector"] != "document_shape" { - t.Errorf("inspector = %v, want document_shape", first["inspector"]) + if first["inspector"] != "file_tree" { + t.Errorf("inspector = %v, want file_tree", first["inspector"]) } - if first["description"] != inspect.Summary("document_shape") { - t.Errorf("description = %v, want %q", first["description"], inspect.Summary("document_shape")) + if first["description"] != inspect.Summary("file_tree") { + t.Errorf("description = %v, want %q", first["description"], inspect.Summary("file_tree")) } } diff --git a/internal/inspect/source.go b/internal/inspect/source.go index 2ba5d9a..094b18d 100644 --- a/internal/inspect/source.go +++ b/internal/inspect/source.go @@ -6,50 +6,39 @@ import ( "path" "path/filepath" "strings" - - "github.com/abegong/katalyst/internal/codec/markdownbodytext" ) // sourceFile is one file discovered by a SourceView walk: 
cheap path-level -// metadata only. Markdown content is parsed lazily (see markdown). +// metadata only. Content inspectors open selected files explicitly. type sourceFile struct { rel string // path relative to the root, slash-separated dir string // directory relative to the root ("." at the top level) ext string // lowercased extension, including the dot } -// sourceDoc is a parsed markdown file in a SourceView. -type sourceDoc struct { - rel string - dir string - doc *markdownbodytext.Document // nil when the file failed to read or parse -} - -// mdCache is the lazily-populated markdown parse, shared across value copies of -// a SourceView via a pointer so the parse happens at most once per view. -type mdCache struct { - loaded bool - count int - docs []sourceDoc +// readCounter is shared across value copies of a SourceView so tests can assert +// that path-only inspectors open no files. +type readCounter struct { + count int } // SourceView is the raw-source layer's addressing surface: a filesystem tree // walked once into per-file metadata, addressed by backend-native reference // (the relative path). Path-level inspectors (file_tree) read only this -// metadata and open no files; content inspectors trigger a one-time markdown -// parse. Filesystem-only for now; generalizing the walk into the storage layer -// is future work. +// metadata and open no files; content inspectors read selected files explicitly. +// Filesystem-only for now; generalizing the walk into the storage layer is +// future work. type SourceView struct { root string files []sourceFile - md *mdCache + reads *readCounter } // NewSourceView walks root, collecting every non-hidden file's path metadata // without opening it. Hidden entries (dot-prefixed, e.g. .git, .katalyst) are // skipped as store noise. 
func NewSourceView(root string) (SourceView, error) { - v := SourceView{root: root, md: &mdCache{}} + v := SourceView{root: root, reads: &readCounter{}} err := filepath.WalkDir(root, func(p string, d fs.DirEntry, err error) error { if err != nil { return err @@ -88,15 +77,14 @@ func (v SourceView) Root() string { return v.root } // N is the number of files discovered. func (v SourceView) N() int { return len(v.files) } -// ParseCount reports how many files the view has opened to parse, 0 until a -// content inspector triggers the markdown parse, which proves file_tree opens -// nothing. -func (v SourceView) ParseCount() int { return v.md.count } +// ParseCount reports how many files the view has opened for content inspection, +// which proves file_tree opens nothing. +func (v SourceView) ParseCount() int { return v.reads.count } // readFile opens one discovered file by relative path and records the read as // content inspection work. func (v SourceView) readFile(rel string) ([]byte, error) { - v.md.count++ + v.reads.count++ return os.ReadFile(filepath.Join(v.root, filepath.FromSlash(rel))) } @@ -108,26 +96,3 @@ func (v SourceView) refsByDir() map[string][]string { } return out } - -// markdown lazily reads and parses every .md file once, caching on the shared -// mdCache so repeated content inspectors don't re-read disk. 
-func (v SourceView) markdown() []sourceDoc { - if v.md.loaded { - return v.md.docs - } - for _, f := range v.files { - if f.ext != ".md" { - continue - } - v.md.count++ - sd := sourceDoc{rel: f.rel, dir: f.dir} - if src, err := os.ReadFile(filepath.Join(v.root, filepath.FromSlash(f.rel))); err == nil { - if doc, perr := markdownbodytext.Parse(src); perr == nil { - sd.doc = doc - } - } - v.md.docs = append(v.md.docs, sd) - } - v.md.loaded = true - return v.md.docs -} diff --git a/internal/inspect/source_test.go b/internal/inspect/source_test.go index af94387..360bf5f 100644 --- a/internal/inspect/source_test.go +++ b/internal/inspect/source_test.go @@ -25,8 +25,7 @@ func TestFileTree_opensNothingAndReportsFilesystemMap(t *testing.T) { t.Error("file_tree should not apply to a non-filesystem type") } - p, _ := inspect.ParseParams("exact", -1, 0) - ev := ft.Inspect(view, p) + ev := ft.Inspect(view, inspect.Params{}) if view.ParseCount() != 0 { t.Errorf("file_tree opened %d files, want 0", view.ParseCount()) } @@ -83,31 +82,3 @@ func TestFileContentShape_profilesSelectedMarkdown(t *testing.T) { t.Errorf("markdown.files = %d, want 1", got) } } - -func TestDocumentShape_clustersOnCompositeFingerprint(t *testing.T) { - dir := t.TempDir() - // Identical across all dimensions → one class. - writeFile(t, dir, "books/dune.md", "---\ntitle: Dune\nrating: 5\n---\n# Dune\n\n## Review\n") - writeFile(t, dir, "books/messiah.md", "---\ntitle: Messiah\nrating: 4\n---\n# Messiah\n\n## Review\n") - // Same frontmatter keys, different body skeleton (Summary, not Review) → - // a different class, proving clustering is not on frontmatter alone. 
- writeFile(t, dir, "books/notes.md", "---\ntitle: Notes\nrating: 3\n---\n# Notes\n\n## Summary\n") - - view, err := inspect.NewSourceView(dir) - if err != nil { - t.Fatalf("NewSourceView: %v", err) - } - p, _ := inspect.ParseParams("exact", -1, 0) - ev := inspect.DocumentShape{}.Inspect(view, p) - - classes := ev.Data["classes"].([]any) - if len(classes) != 1 { - t.Fatalf("classes = %d, want 1 (dune+messiah)", len(classes)) - } - if classes[0].(map[string]any)["size"].(int) != 2 { - t.Errorf("class size = %v, want 2", classes[0].(map[string]any)["size"]) - } - if outliers := ev.Data["outliers"].([]any); len(outliers) != 1 { - t.Errorf("outliers = %d, want 1 (notes, distinct body)", len(outliers)) - } -} diff --git a/internal/inspect/summarize.go b/internal/inspect/summarize.go deleted file mode 100644 index 58814b2..0000000 --- a/internal/inspect/summarize.go +++ /dev/null @@ -1,120 +0,0 @@ -package inspect - -import ( - "fmt" - "sort" -) - -// Profile is one unit's fingerprint for the summarizer: a label (the directory -// or file it describes) and an unordered set of feature tokens. Two profiles -// are compared by Jaccard similarity over their feature sets. -type Profile struct { - Label string - Features []string -} - -// summarize collapses profiles into named classes so output is proportional to -// the number of distinct profiles, not the number of units. Profiles whose -// similarity meets the Params tolerance share a class; singletons are reported -// separately as outliers. Shared by the file_tree* and document_shape -// inspectors. -func summarize(profiles []Profile, p Params) map[string]any { - sorted := append([]Profile(nil), profiles...) 
- sort.Slice(sorted, func(i, j int) bool { return sorted[i].Label < sorted[j].Label }) - - var classes []class - if p.mode == budgetMode { - classes = clusterToBudget(sorted, p.maxClasses) - } else { - classes = cluster(sorted, p.threshold) - } - - classList := []any{} - outliers := []any{} - n := 0 - for _, c := range classes { - if len(c.members) == 1 { - outliers = append(outliers, map[string]any{ - "label": c.members[0], - "features": c.rep, - }) - continue - } - n++ - classList = append(classList, map[string]any{ - "class": fmt.Sprintf("P%d", n), - "size": len(c.members), - "features": c.rep, - "members": c.members, - }) - } - return map[string]any{"classes": classList, "outliers": outliers} -} - -// class is one cluster: its representative feature set (the first member's) and -// the labels of its members. -type class struct { - rep []string - members []string -} - -// cluster greedily groups profiles: each profile joins the first existing class -// whose representative is at least `threshold` similar, else starts a new class. -func cluster(profiles []Profile, threshold float64) []class { - var classes []class - for _, pr := range profiles { - joined := false - for i := range classes { - if jaccard(classes[i].rep, pr.Features) >= threshold { - classes[i].members = append(classes[i].members, pr.Label) - joined = true - break - } - } - if !joined { - classes = append(classes, class{rep: pr.Features, members: []string{pr.Label}}) - } - } - return classes -} - -// clusterToBudget lowers the similarity threshold (1.00 → 0.00 in 0.05 steps) -// until the cluster count fits maxClasses, returning the tightest grouping that -// does. Threshold 0 merges everything into one class, so this always converges. 
-func clusterToBudget(profiles []Profile, maxClasses int) []class { - for step := 100; step >= 0; step -= 5 { - c := cluster(profiles, float64(step)/100) - if len(c) <= maxClasses { - return c - } - } - return cluster(profiles, 0) -} - -// jaccard is the Jaccard similarity of two feature sets: |A∩B| / |A∪B|. Two -// empty sets are identical (1). -func jaccard(a, b []string) float64 { - sa, sb := toSet(a), toSet(b) - if len(sa) == 0 && len(sb) == 0 { - return 1 - } - inter := 0 - for k := range sa { - if sb[k] { - inter++ - } - } - union := len(sa) + len(sb) - inter - if union == 0 { - return 1 - } - return float64(inter) / float64(union) -} - -func toSet(ss []string) map[string]bool { - out := make(map[string]bool, len(ss)) - for _, s := range ss { - out[s] = true - } - return out -} diff --git a/internal/inspect/summarize_test.go b/internal/inspect/summarize_test.go deleted file mode 100644 index a368c73..0000000 --- a/internal/inspect/summarize_test.go +++ /dev/null @@ -1,72 +0,0 @@ -package inspect - -import ( - "fmt" - "testing" -) - -func TestSummarize_classesAndOutliers(t *testing.T) { - var profiles []Profile - for i := 0; i < 190; i++ { - profiles = append(profiles, Profile{Label: fmt.Sprintf("a%03d", i), Features: []string{"md", "kebab"}}) - } - for i := 0; i < 7; i++ { - profiles = append(profiles, Profile{Label: fmt.Sprintf("b%03d", i), Features: []string{"md", "snake"}}) - } - profiles = append(profiles, - Profile{Label: "x1", Features: []string{"png"}}, - Profile{Label: "x2", Features: []string{"pdf"}}, - Profile{Label: "x3", Features: []string{"csv"}}, - ) - - p, _ := ParseParams("exact", -1, 0) - out := summarize(profiles, p) - - classes := out["classes"].([]any) - if len(classes) != 2 { - t.Fatalf("classes = %d, want 2", len(classes)) - } - if outliers := out["outliers"].([]any); len(outliers) != 3 { - t.Errorf("outliers = %d, want 3", len(outliers)) - } - top := classes[0].(map[string]any) - if top["size"].(int) != 190 { - t.Errorf("top class size 
= %v, want 190", top["size"]) - } -} - -// Higher tolerance (lower threshold) collapses near-but-distinct profiles, so -// the class count drops. -func TestSummarize_higherToleranceFewerClasses(t *testing.T) { - profiles := []Profile{ - {Label: "p1", Features: []string{"a", "b", "c"}}, - {Label: "p2", Features: []string{"a", "b", "d"}}, // Jaccard with p1 = 2/4 = 0.5 - {Label: "p3", Features: []string{"x", "y", "z"}}, - } - exact, _ := ParseParams("exact", -1, 0) // threshold 1.0 - coarse, _ := ParseParams("", 0.5, 0) // threshold 0.5 - - nExact := classCount(summarize(profiles, exact)) - nCoarse := classCount(summarize(profiles, coarse)) - if nCoarse >= nExact { - t.Errorf("expected fewer classes at higher tolerance: exact=%d coarse=%d", nExact, nCoarse) - } -} - -func TestSummarize_budgetCapsClasses(t *testing.T) { - profiles := []Profile{ - {Label: "p1", Features: []string{"a"}}, - {Label: "p2", Features: []string{"b"}}, - {Label: "p3", Features: []string{"c"}}, - {Label: "p4", Features: []string{"d"}}, - } - p, _ := ParseParams("", -1, 2) - if got := classCount(summarize(profiles, p)); got > 2 { - t.Errorf("budget 2 exceeded: %d classes", got) - } -} - -// classCount is the number of distinct classes, counting singleton outliers. -func classCount(out map[string]any) int { - return len(out["classes"].([]any)) + len(out["outliers"].([]any)) -} diff --git a/product/specs/file-content-shape-inspector-plan.md b/product/specs/file-content-shape-inspector-plan.md index 2d523b2..afe5827 100644 --- a/product/specs/file-content-shape-inspector-plan.md +++ b/product/specs/file-content-shape-inspector-plan.md @@ -9,16 +9,13 @@ configured collection runs collection inspectors; a filesystem directory runs source inspectors. `--inspector` already narrows by registry name. - `internal/inspect/params.go` carries source and collection inspector - parameters for summarizer detail. Inspectors that do not need a parameter - ignore it. + parameters. 
After retiring grouping, it carries only selected-file state. - `internal/inspect/source.go` walks non-hidden filesystem paths into - `SourceView.files`, and lazily parses all `.md` files through - `SourceView.markdown`. + `SourceView.files`, and content inspectors read selected files explicitly. - `file_tree` now owns the path-only store map. The old content inspector parsed Markdown per directory and rendered generic `classes` / `outliers`. -- `document_shape` still clusters Markdown files by composite fingerprint. The - new spec keeps clustering as a follow-up path, not the primary content-shape - workflow. +- `document_shape` and the summarizer-backed grouping path were retired after + review; explicit selections are the content-shape workflow. - `internal/inspect/render.go` has a custom Markdown renderer for `file_tree`; all other inspectors use generic key/value rendering. @@ -45,8 +42,7 @@ invalid combinations fail before files are opened. 1. **File:** `internal/inspect/params.go`. Add a `Selection` value on `Params`, with `Label`, `Mode`, and `Pattern`. - Extend `ParseParams` or add a small wrapper so existing summarizer tests stay - focused and selection validation remains in `cmd/inspect.go`. + Keep selection validation in `cmd/inspect.go`. 2. **File:** `cmd/inspect.go`. Add `--select string`. It is valid only when the target resolves to the source layer, exactly one `--inspector` is supplied, and that inspector is @@ -168,7 +164,7 @@ suite verifies behavior. 
| File | Role | |---|---| | `cmd/inspect.go` | adds and validates `--select`; passes selection through params | -| `internal/inspect/params.go` | carries selection alongside summarizer parameters | +| `internal/inspect/params.go` | carries selected-file state | | `internal/inspect/selection.go` | path-derived selection resolver | | `internal/inspect/source.go` | selected file helpers and relative file reads | | `internal/inspect/filecontentshape.go` | summary builder and Markdown/CSV/JSON parsers | @@ -188,7 +184,7 @@ suite verifies behavior. | Parser scope | Markdown, CSV, JSON only | enough to prove text/tabular/tree views without dependency drift | | Selection timing | path-only before content reads | deterministic, cheap, and matches the spec boundary | | Output model | complete JSON, capped Markdown | machines get full evidence; humans get a short report | -| Clustering | deferred | explicit selections are the primary workflow for now | +| Clustering | retired | explicit selections are the primary workflow | ## Documentation updates @@ -202,5 +198,6 @@ suite verifies behavior. - HTML, XML, YAML, TOML, code AST, Markdown table, or JSON array-to-table parsers. - Content predicates in selection syntax. -- Automatic selection suggestions or clustering. +- Automatic selection suggestions. +- Automatic grouping or clustering. - Alias compatibility for `file_tree_content`; add deliberately later if needed. diff --git a/product/specs/file-content-shape-inspector-spec.md b/product/specs/file-content-shape-inspector-spec.md index cd09d41..92d1ac5 100644 --- a/product/specs/file-content-shape-inspector-spec.md +++ b/product/specs/file-content-shape-inspector-spec.md @@ -34,23 +34,18 @@ The loop is explicit: `--inspector` flag. That is the natural user-facing hook for `file_content_shape`; adding an inspector-specific subcommand would create a second invocation grammar for the same registry. 
-- `internal/inspect/params.go` carries inspector parameters (`--detail`, - `--similarity`, `--max-classes`) through `inspect.Params`. Inspectors that do - not use a parameter ignore it. Selection can follow that pattern if validation - keeps it scoped to `file_content_shape`. +- `internal/inspect/params.go` carries selected-file state through + `inspect.Params`. Validation keeps `--select` scoped to `file_content_shape`. - `internal/inspect/source.go` builds `SourceView.files` from path metadata and - lazily parses only `.md` files through `SourceView.markdown`. -- `internal/inspect/inspectors_source.go` implements `FileTreeContent.Inspect` - by grouping parsed Markdown documents per directory, reducing each directory - to feature tokens (`parsed`, `frontmatter`, `fmkey:`), and passing them - to `summarize`. -- `internal/inspect/registry.go` describes `file_tree_content` as Markdown-only: - "Parse markdown and profile each directory's content shape." -- `internal/inspect/render.go` renders every inspector through the same generic - key/value Markdown renderer. The output exposes `classes` and `outliers` - instead of a readable report. -- `docs/content/deep-dives/inspectors.md` describes `file_tree` and - `document_shape` as summarizing inspectors that collapse profiles into classes. + lets content inspectors read selected files explicitly. +- `internal/inspect/inspectors_source.go` keeps `file_tree` as the path-only + raw-source inspector. +- `internal/inspect/registry.go` exposes `file_content_shape` as the selected + content profiler. +- `internal/inspect/render.go` has dedicated Markdown projections for + `file_tree` and `file_content_shape`. +- `docs/content/deep-dives/inspectors.md` describes raw-source inspection as + store map plus selected content shape. That model is too narrow. Markdown is one content source, not the boundary of the second raw-source inspector. 
Profiling the entire directory by default also mixes @@ -68,9 +63,8 @@ The raw-source layer has two primary inspection levels: performs light parsing. It reports shared content views, common structure, variation, and read or parse issues. -The earlier `document_shape` clustering idea is deferred from the primary path. -Katalyst can add suggestion and clustering features later, but the core CLI -should first let the reader test explicit selections. +The earlier `document_shape` clustering path is retired. The core CLI lets the +reader test explicit selections instead of proposing groups. ### Command surface @@ -305,12 +299,10 @@ Variation reports meaningful differences: These sections use counts and denominators. They do not recommend a schema or collection. -### Relationship to `document_shape` +### Relationship to grouping -`document_shape` should not be the primary automatic clustering path for this -workflow. A future clustering or suggestion command can propose likely -selections, but `file_content_shape` should profile an explicit selection and -report evidence. +Automatic grouping is intentionally out of scope for this workflow. +`file_content_shape` profiles an explicit selection and reports evidence. This keeps Katalyst's primary raw-source flow deterministic and explainable: @@ -334,6 +326,7 @@ Examples of acceptable deferrals: - YAML parsing - a full query language - automatic selection suggestions +- automatic grouping or clustering ## Open Questions @@ -345,8 +338,7 @@ name. ## Documentation updates - `docs/content/deep-dives/inspectors.md`: update the raw-source model from - profile clustering to store map plus content shape. Note that clustering and - suggestions are follow-up features, not the primary flow. + profile clustering to store map plus selected content shape. - `internal/inspect/doc.go`: align the package summary if it names Markdown-only or clustering-specific behavior. 
- `docs/content/reference/inspectors/`: regenerate with `make docs-gen` if the From 51b8aef6bbbb394daf72c365f3514465a732cae1 Mon Sep 17 00:00:00 2001 From: Abe Gong Date: Thu, 25 Jun 2026 06:59:47 -0600 Subject: [PATCH 10/10] Fix rebase fallout --- cmd/inspectors_test.go | 4 ++-- cmd/testdata/snapshots/inspectors/show-file_content_shape.txt | 2 +- internal/inspect/filecontentshape.go | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cmd/inspectors_test.go b/cmd/inspectors_test.go index 68885a2..9e6e34c 100644 --- a/cmd/inspectors_test.go +++ b/cmd/inspectors_test.go @@ -14,7 +14,7 @@ func TestInspectors_listsEveryInspectorGroupedByLayer(t *testing.T) { t.Fatalf("inspectors list: %v", err) } - for _, want := range []string{"file_tree", "document_shape", "object_fields", "markdown_body"} { + for _, want := range []string{"file_tree", "file_content_shape", "object_fields", "markdown_body"} { if !strings.Contains(stdout, want) { t.Errorf("expected inspector %q in output", want) } @@ -125,7 +125,7 @@ func TestInspectorsList_jsonArrayCoversEveryDescriptor(t *testing.T) { t.Errorf("entry %d (%s): empty layer/summary", i, d.Name) } } - for _, want := range []string{"file_tree", "document_shape", "object_fields", "markdown_body"} { + for _, want := range []string{"file_tree", "file_content_shape", "object_fields", "markdown_body"} { if !seen[want] { t.Errorf("expected inspector %q in JSON output", want) } diff --git a/cmd/testdata/snapshots/inspectors/show-file_content_shape.txt b/cmd/testdata/snapshots/inspectors/show-file_content_shape.txt index 50a3a43..da333b4 100644 --- a/cmd/testdata/snapshots/inspectors/show-file_content_shape.txt +++ b/cmd/testdata/snapshots/inspectors/show-file_content_shape.txt @@ -1,5 +1,5 @@ Raw-source inspectors › File content shape -------------------------------------------- +-------------------------------------------- - inspector: file_content_shape - layer: source - family: structural diff --git 
a/internal/inspect/filecontentshape.go b/internal/inspect/filecontentshape.go index b710313..137eb86 100644 --- a/internal/inspect/filecontentshape.go +++ b/internal/inspect/filecontentshape.go @@ -7,8 +7,8 @@ import ( "fmt" "sort" + "github.com/abegong/katalyst/internal/codec/markdownbodytext" "github.com/abegong/katalyst/internal/storage" - "github.com/abegong/katalyst/internal/storage/collection/document" ) type contentIssue struct { @@ -70,7 +70,7 @@ func buildFileContentShape(v SourceView, sel Selection) map[string]any { switch f.ext { case ".md": mdFiles++ - doc, err := document.Parse(src) + doc, err := markdownbodytext.Parse(src) if err != nil { issues = append(issues, contentIssue{Path: f.rel, Kind: "parse_failed", Detail: err.Error()}) continue