From 0c0c7e976ac063b40f9f0f1eaf48b6bd5273ce18 Mon Sep 17 00:00:00 2001 From: flupkede <flupkede@users.noreply.github.com> Date: Mon, 1 Jun 2026 21:44:33 +0200 Subject: [PATCH 1/7] chore: add /merge and /release Claude Code slash commands Codify the project release workflow as two committed slash commands under .claude/commands/ (force-added past .gitignore, like .claude/CLAUDE.md): - /merge: README/CHANGELOG freshness checks -> commit -> validate -> push -> PR to develop -> auto-merge after CI. No tag. - /release: /merge, then promote develop -> master via a "Release vX.Y.Z" PR (protect-master allows develop), then push the vX.Y.Z tag that triggers release.yml. Includes optional post-release develop sync. Commands document the repo's real conventions: feature->develop->master flow, master branch protection, and the pre-commit version-bump-on-feature-branches rule that fixes the release version at the feature commit. Tooling-only change on a chore/ branch: no version bump, no CHANGELOG entry (CHANGELOG tracks the shipped binary's behavior). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --- .claude/commands/merge.md | 79 +++++++++++++++++++++++++++++++++++++ .claude/commands/release.md | 59 +++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 .claude/commands/merge.md create mode 100644 .claude/commands/release.md diff --git a/.claude/commands/merge.md b/.claude/commands/merge.md new file mode 100644 index 0000000..5ceff74 --- /dev/null +++ b/.claude/commands/merge.md @@ -0,0 +1,79 @@ +--- +description: Land the current feature branch on develop — README/CHANGELOG checks, commit, push, PR, auto-merge +argument-hint: [optional PR title] +allowed-tools: Bash(git:*), Bash(gh:*), Bash(cargo:*), Bash(grep:*), Read, Edit, Grep, Glob +--- + +# /merge — land the current feature branch on `develop` + +Run the project's **merge workflow**: verify docs are current, then bring the current +feature branch into `develop` through a pull request.
This command does **not** tag a +release — tagging happens only in `/release`. + +## Branch & version facts (this repo) +- Flow: `feature/*` | `features/*` | `fix/*` → PR → **`develop`** → (later) PR → **`master`**. +- `master` is protected (`.github/workflows/protect-master.yml`): it accepts PRs only from + `develop` or `release/*`. +- The pre-commit hook **bumps the patch version (+1) and rebuilds the binary on feature + branches only** (`feature/*`, `features/*`, `fix/*`). On `develop`/`master`/`release`/`chore` + it runs `cargo fmt` only — no bump. So **the feature-branch commit here fixes the release + version**; it carries forward unchanged through develop, master, and the tag. + +## Guardrails +- ABORT if the current branch is `develop` or `master`. This command runs from a feature branch. +- NEVER push directly to `develop` or `master` — everything lands via a PR. +- NEVER pass `--no-verify` / `--no-gpg-sign` — let the pre-commit hook run (it bumps + rebuilds). +- Do NOT create or push a tag here. That is `/release`'s job. +- Do NOT force-push. + +## Steps + +1. **Context** + - `git rev-parse --abbrev-ref HEAD` → current branch. If `develop`/`master`, STOP with an error. + - `git fetch origin`. + - Compute the change set landing on develop: `git log origin/develop..HEAD --oneline` + plus `git status --short` for uncommitted work. If there is nothing to land, report and STOP. + +2. **README up to date?** + - Inspect the change set for user-facing changes: new/removed CLI flags or subcommands, + behavior changes, new env vars, new supported languages, new MCP tools. + - Compare against `README.md`. If anything is missing, wrong, or stale, **UPDATE `README.md`** + so it matches reality. Keep examples free of hardcoded config strings (per CLAUDE.md). + - If README already matches, state that and move on. + +3. 
**CHANGELOG up to date?** + - Ensure `CHANGELOG.md` has an entry for this change under a `## [X.Y.Z] - YYYY-MM-DD` + heading with `Added` / `Changed` / `Fixed` subsections describing every user-facing change. + - **Version for the heading** = current `Cargo.toml` version **+ 1 patch** (the hook will bump + to that value on commit). Read it: `grep -m1 '^version' Cargo.toml`. + - Use today's date. If an accurate entry already exists for the pending version, leave it. + +4. **Commit** + - Stage code + doc changes (`git add -A`, plus `git add -f` for any tracked-but-gitignored file). + - Commit with a clear, scoped message. End the message with: + `Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>` + - Let the pre-commit hook finish (fmt → version bump → rebuild). This can take 60–120s. + +5. **Validate** (fast loop, per CLAUDE.md — do NOT run `--release`): + - `cargo fmt --all -- --check` + - `cargo check --all-targets` + - `cargo clippy --all-targets -- -D warnings` + - Fix any failures and commit again before pushing. Never push code that fails these. + +6. **Push** + - `git push -u origin HEAD`. + +7. **Open PR → develop** + - `gh pr create --base develop --head <branch> --title "<title>" --body "<body>"`. + - Title: use `$ARGUMENTS` if provided; otherwise summarize the branch concisely. + - Body: bullet summary of changes; end with: + `🤖 Generated with [Claude Code](https://claude.com/claude-code)`. + +8. **Auto-merge after CI** + - `gh pr merge --auto --merge` so the PR lands automatically once required checks pass. + - If auto-merge is not enabled on the repo (command errors), fall back: poll + `gh pr checks <num> --watch`, then `gh pr merge --merge` once green. + +## Report +Branch, pending release version, doc updates made, PR URL, and merge status +(auto-merge enabled / merged).
diff --git a/.claude/commands/release.md b/.claude/commands/release.md new file mode 100644 index 0000000..822d310 --- /dev/null +++ b/.claude/commands/release.md @@ -0,0 +1,59 @@ +--- +description: Cut a release — run /merge (feature → develop), then promote develop → master and push the version tag +argument-hint: [optional PR/release title] +allowed-tools: Bash(git:*), Bash(gh:*), Bash(cargo:*), Bash(grep:*), Read, Edit, Grep, Glob +--- + +# /release — full release: land on `develop`, promote to `master`, tag + +This is `/merge` **plus** the `develop → master` promotion and the version-tag push that +triggers the build/publish pipeline. + +## Branch & version facts (this repo) +- Flow: `feature/*` → PR → **`develop`** → PR → **`master`** → push tag `vX.Y.Z`. +- `master` is protected: PRs to it may come **only** from `develop` or `release/*` + (`.github/workflows/protect-master.yml`). +- Pushing a `vX.Y.Z` tag triggers `.github/workflows/release.yml` (builds Windows/Linux/macOS + archives, plain + `-with-csharp`, and publishes the GitHub release). **Push the tag only + AFTER the develop→master PR has merged.** +- The version is fixed by the feature-branch commit (the pre-commit hook bumps only on + feature branches). develop/master merges and the tag all carry that same version. + +## Guardrails +- NEVER use `--no-verify`. NEVER force-push shared branches. +- Push the tag exactly once, only after master has the release commit. +- If CI fails at any gate, STOP and report — do not promote or tag a red build. + +## Part 1 — land on `develop` (the `/merge` workflow) +Execute every step of **`/merge`** (README/CHANGELOG checks → commit → push → PR → auto-merge +to `develop`). Then **wait for the develop PR to actually merge** (auto-merge waits on CI): +- Get the PR number, then poll `gh pr view <num> --json state,mergedAt,mergeStateStatus` + until `state` is `MERGED`. +- If checks fail, STOP and report. Do not proceed to Part 2. 
+ +## Part 2 — promote `develop` → `master` +1. `git fetch origin && git checkout develop && git pull --ff-only origin develop`. +2. Determine the release version: `VERSION=v$(grep -m1 '^version' Cargo.toml | sed -E 's/.*"(.+)".*/\1/')`. +3. Open the release PR (source `develop`, which protect-master allows): + - `gh pr create --base master --head develop --title "Release $VERSION — <summary>" --body "<body>"`. + - Title: prefix `Release $VERSION — ` then a short summary (or `$ARGUMENTS` if provided), + matching history (e.g. `Release v1.0.142 — serve responsive during warmup`). + - Body ends with: `🤖 Generated with [Claude Code](https://claude.com/claude-code)`. +4. `gh pr merge --auto --merge`. Wait until `state` is `MERGED` (poll as in Part 1). + If auto-merge is unavailable, `gh pr checks <num> --watch` then `gh pr merge --merge`. + If CI fails, STOP. + +## Part 3 — tag the release +1. `git fetch origin && git checkout master && git pull --ff-only origin master`. +2. Confirm the version on master matches: `grep -m1 '^version' Cargo.toml` equals `$VERSION` (minus the `v`). + If it does not match, STOP and report (do not guess a tag). +3. `git tag "$VERSION" && git push origin "$VERSION"` → triggers `release.yml`. +4. Report the pushed tag and remind the user to watch the Actions "Release" run for artifacts. + +## Part 4 — keep `develop` in sync (only if needed) +If `master` ended up ahead of `develop` (e.g. a CHANGELOG/version edit merged only on master), +open a sync PR `master → develop` (or fast-forward develop) — matching the repo's post-release +sync convention (e.g. PR #90 "sync: backfill CHANGELOG … from master"). Skip if already in sync. + +## Report +develop PR URL, release PR URL, tag pushed (`vX.Y.Z`), final version, and sync action (if any). 
From ea183c80f2d9c57d5f7d4502485fb7dd2c8b4ad7 Mon Sep 17 00:00:00 2001 From: flupkede <flupkede@users.noreply.github.com> Date: Mon, 1 Jun 2026 21:47:37 +0200 Subject: [PATCH 2/7] chore: address review remarks on /merge and /release commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - /merge: abort unless on feature/*|features/*|fix/* (the only branches the pre-commit hook version-bumps) — closes the gap where running from a non-bumping branch silently broke the version/CHANGELOG premise. - Clarify CHANGELOG heading version math for multi-commit landings (hook bumps +1 per commit; verify heading matches Cargo.toml after the final commit). - Capture PR numbers explicitly (gh pr view --json number) before merge/poll. - /release: fetch --tags and guard against a double release (stop if the tag already exists locally or on origin). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --- .claude/commands/merge.md | 22 ++++++++++++++++------ .claude/commands/release.md | 20 ++++++++++++-------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/.claude/commands/merge.md b/.claude/commands/merge.md index 5ceff74..3931644 100644 --- a/.claude/commands/merge.md +++ b/.claude/commands/merge.md @@ -20,7 +20,10 @@ release — tagging happens only in `/release`. version**; it carries forward unchanged through develop, master, and the tag. ## Guardrails -- ABORT if the current branch is `develop` or `master`. This command runs from a feature branch. +- ABORT unless the current branch matches `feature/*`, `features/*`, or `fix/*` — i.e. the + branches the pre-commit hook version-bumps. Never run from `develop`, `master`, `release/*`, + or `chore/*`: on those the hook does **not** bump, so the version/CHANGELOG premise below + would silently break. - NEVER push directly to `develop` or `master` — everything lands via a PR. 
- NEVER pass `--no-verify` / `--no-gpg-sign` — let the pre-commit hook run (it bumps + rebuilds). - Do NOT create or push a tag here. That is `/release`'s job. @@ -29,7 +32,8 @@ release — tagging happens only in `/release`. ## Steps 1. **Context** - - `git rev-parse --abbrev-ref HEAD` → current branch. If `develop`/`master`, STOP with an error. + - `git rev-parse --abbrev-ref HEAD` → current branch. If it is NOT `feature/*`, `features/*`, + or `fix/*`, STOP with an error (see Guardrails). - `git fetch origin`. - Compute the change set landing on develop: `git log origin/develop..HEAD --oneline` plus `git status --short` for uncommitted work. If there is nothing to land, report and STOP. @@ -44,8 +48,12 @@ release — tagging happens only in `/release`. 3. **CHANGELOG up to date?** - Ensure `CHANGELOG.md` has an entry for this change under a `## [X.Y.Z] - YYYY-MM-DD` heading with `Added` / `Changed` / `Fixed` subsections describing every user-facing change. - - **Version for the heading** = current `Cargo.toml` version **+ 1 patch** (the hook will bump - to that value on commit). Read it: `grep -m1 '^version' Cargo.toml`. + - **Version for the heading**: the hook bumps the patch by +1 on **every** feature-branch + commit where the working-tree version still equals HEAD's. The most reliable approach is to + land this branch in a **single commit** — then the heading version = current + `Cargo.toml` version + 1 (`grep -m1 '^version' Cargo.toml`). If you commit more than once, + the version advances once per commit; after the final commit, read the actual + `Cargo.toml` version and make sure the CHANGELOG heading matches it (fix it if not). - Use today's date. If an accurate entry already exists for the pending version, leave it. 4. **Commit** @@ -68,11 +76,13 @@ release — tagging happens only in `/release`. - Title: use `$ARGUMENTS` if provided; otherwise summarize the branch concisely. 
- Body: bullet summary of changes; end with: `🤖 Generated with [Claude Code](https://claude.com/claude-code)`. + - Capture the PR number for the next step: + `PR=$(gh pr view --json number --jq .number)`. 8. **Auto-merge after CI** - - `gh pr merge --auto --merge` so the PR lands automatically once required checks pass. + - `gh pr merge "$PR" --auto --merge` so the PR lands automatically once required checks pass. - If auto-merge is not enabled on the repo (command errors), fall back: poll - `gh pr checks <num> --watch`, then `gh pr merge --merge` once green. + `gh pr checks "$PR" --watch`, then `gh pr merge "$PR" --merge` once green. ## Report Branch, pending release version, doc updates made, PR URL, and merge status diff --git a/.claude/commands/release.md b/.claude/commands/release.md index 822d310..16f2641 100644 --- a/.claude/commands/release.md +++ b/.claude/commands/release.md @@ -27,8 +27,8 @@ triggers the build/publish pipeline. ## Part 1 — land on `develop` (the `/merge` workflow) Execute every step of **`/merge`** (README/CHANGELOG checks → commit → push → PR → auto-merge to `develop`). Then **wait for the develop PR to actually merge** (auto-merge waits on CI): -- Get the PR number, then poll `gh pr view <num> --json state,mergedAt,mergeStateStatus` - until `state` is `MERGED`. +- Capture the PR number (`PR=$(gh pr view --json number --jq .number)`), then poll + `gh pr view "$PR" --json state,mergedAt,mergeStateStatus` until `state` is `MERGED`. - If checks fail, STOP and report. Do not proceed to Part 2. ## Part 2 — promote `develop` → `master` @@ -39,16 +39,20 @@ to `develop`). Then **wait for the develop PR to actually merge** (auto-merge wa - Title: prefix `Release $VERSION — ` then a short summary (or `$ARGUMENTS` if provided), matching history (e.g. `Release v1.0.142 — serve responsive during warmup`). - Body ends with: `🤖 Generated with [Claude Code](https://claude.com/claude-code)`. -4. `gh pr merge --auto --merge`. 
Wait until `state` is `MERGED` (poll as in Part 1). - If auto-merge is unavailable, `gh pr checks <num> --watch` then `gh pr merge --merge`. - If CI fails, STOP. + - Capture the PR number: `RELEASE_PR=$(gh pr view develop --json number --jq .number)`. +4. `gh pr merge "$RELEASE_PR" --auto --merge`. Wait until `state` is + `MERGED` (poll as in Part 1). If auto-merge is unavailable, `gh pr checks "$RELEASE_PR" --watch` + then `gh pr merge "$RELEASE_PR" --merge`. If CI fails, STOP. ## Part 3 — tag the release -1. `git fetch origin && git checkout master && git pull --ff-only origin master`. +1. `git fetch origin --tags && git checkout master && git pull --ff-only origin master`. 2. Confirm the version on master matches: `grep -m1 '^version' Cargo.toml` equals `$VERSION` (minus the `v`). If it does not match, STOP and report (do not guess a tag). -3. `git tag "$VERSION" && git push origin "$VERSION"` → triggers `release.yml`. -4. Report the pushed tag and remind the user to watch the Actions "Release" run for artifacts. +3. Guard against a double release: if `$VERSION` already exists as a tag + (`git tag -l "$VERSION"` non-empty, or `git ls-remote --tags origin "$VERSION"` non-empty), + STOP — the release was already cut. +4. `git tag "$VERSION" && git push origin "$VERSION"` → triggers `release.yml`. +5. Report the pushed tag and remind the user to watch the Actions "Release" run for artifacts. ## Part 4 — keep `develop` in sync (only if needed) If `master` ended up ahead of `develop` (e.g. 
a CHANGELOG/version edit merged only on master), From 981b4a8c49f3da2f757d151ba4e42442814972a5 Mon Sep 17 00:00:00 2001 From: flupkede <flupkede@users.noreply.github.com> Date: Mon, 1 Jun 2026 21:56:45 +0200 Subject: [PATCH 3/7] docs: document /merge and /release workflow in AGENTS.md Add a Release workflow section describing the two slash commands, the branch-protection rule, the tag-triggers-release.yml pipeline, and the feature-branch-only version-bump rule that fixes the release version. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --- AGENTS.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index a6b967e..f239200 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -267,6 +267,27 @@ LMDB **does not allow** two `EnvOpenOptions::open()` handles on the same directo --- +## Release workflow — `/merge` and `/release` + +Two committed Claude Code slash commands codify the release process +(`.claude/commands/merge.md`, `.claude/commands/release.md`; force-added past `.gitignore`). + +- **`/merge`** — land the current feature branch on `develop`: README/CHANGELOG freshness + checks → commit → `cargo fmt`/`check`/`clippy` → push → PR to `develop` → `gh pr merge --auto` + (lands after CI). Does **not** tag. +- **`/release`** — `/merge`, then promote `develop` → `master` via a `Release vX.Y.Z` PR + (`protect-master.yml` allows PRs to `master` only from `develop` or `release/*`), then push + the `vX.Y.Z` tag that triggers `.github/workflows/release.yml` (6 archives, plain + + `-with-csharp`). Includes an optional post-release `master → develop` sync. + +**Version rule (encoded in the commands):** the `pre-commit` hook bumps the patch (+1) and +rebuilds **only on `feature/*` | `features/*` | `fix/*` branches**; `develop`/`master`/`release`/ +`chore` get `cargo fmt` only. So the release version is fixed at the feature-branch commit and +carries forward unchanged through develop, master, and the tag. 
`/merge` therefore aborts unless +run from a feature/fix branch. + +--- + ## Live Test Report — 2026-05-08 **Versie**: codesearch v1.0.93+416 From 884ccd65730b66dfbbed7f5f79d8efdd233ba86a Mon Sep 17 00:00:00 2001 From: flupkede <flupkede@users.noreply.github.com> Date: Mon, 1 Jun 2026 23:24:08 +0200 Subject: [PATCH 4/7] feat(chunker): semantic Markdown chunking via tree-sitter-md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Markdown and .txt files were indexed as a single whole-file block (the fallback chunker has no char budget), so a search hit returned an entire page — real Aprimo docs reached 80 KB in one chunk. Add the tree-sitter-md *block* grammar and chunk Markdown by heading section instead: each chunk is one heading plus its own prose/code, excluding nested subsections (which become their own chunks). The heading path is carried in the breadcrumb context (File > Title > Subsection) so embeddings capture each section's place in the document. Also add split_oversized, a char- and line-aware splitter for the unstructured paths (Markdown + the generic fallback): a single physical line longer than the char budget is hard-split on UTF-8 boundaries, so scraped one-line HTML/markdown can no longer produce an enormous chunk. The structured code path keeps using split_if_needed unchanged, so code chunking is unaffected. 
- Cargo.toml: add tree-sitter-md 0.5.3 - grammar.rs/language.rs: register Markdown as tree-sitter-supported - semantic.rs: chunk_markdown + emit_md_section + split_oversized - tests: section split, nested breadcrumbs, oversized + long-line splits Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --- Cargo.lock | 13 +- Cargo.toml | 3 +- src/chunker/grammar.rs | 20 ++- src/chunker/parser.rs | 3 +- src/chunker/semantic.rs | 385 +++++++++++++++++++++++++++++++++++++++- src/file/language.rs | 5 +- 6 files changed, 421 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9d36a43..cdc968a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.142" +version = "1.0.143" dependencies = [ "anyhow", "arroy", @@ -686,6 +686,7 @@ dependencies = [ "tree-sitter-java", "tree-sitter-javascript", "tree-sitter-json", + "tree-sitter-md", "tree-sitter-php", "tree-sitter-python", "tree-sitter-ruby", @@ -5166,6 +5167,16 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782" +[[package]] +name = "tree-sitter-md" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2efd398be546456c814598ee56c0f51769a77241511b4a58077815d120afa882" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tree-sitter-php" version = "0.24.2" diff --git a/Cargo.toml b/Cargo.toml index e41c0b9..014cd5c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.142" +version = "1.0.143" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" @@ -52,6 +52,7 @@ tree-sitter-ruby = "0.23.1" tree-sitter-php = "0.24.2" tree-sitter-yaml = "0.7.2" tree-sitter-json = "0.24.8" +tree-sitter-md = "0.5.3" # File handling ignore = "0.4" diff --git a/src/chunker/grammar.rs 
b/src/chunker/grammar.rs index e891628..a6e7dfb 100644 --- a/src/chunker/grammar.rs +++ b/src/chunker/grammar.rs @@ -72,6 +72,11 @@ impl GrammarManager { Language::Php => Ok(tree_sitter_php::LANGUAGE_PHP.into()), Language::Yaml => Ok(tree_sitter_yaml::LANGUAGE.into()), Language::Json => Ok(tree_sitter_json::LANGUAGE.into()), + // Markdown uses the tree-sitter-md *block* grammar (sections, headings, + // code fences). The inline grammar is intentionally not used: chunk + // boundaries only need block structure, and the block grammar runs on a + // plain `Parser` like every other language here. + Language::Markdown => Ok(tree_sitter_md::LANGUAGE.into()), _ => Err(anyhow!( "Language {} does not support tree-sitter", language.name() @@ -96,6 +101,7 @@ impl GrammarManager { Language::Php, Language::Yaml, Language::Json, + Language::Markdown, ] } @@ -251,10 +257,19 @@ mod tests { } #[test] - fn test_unsupported_language() { + fn test_load_markdown_grammar() { let manager = GrammarManager::new(); let grammar = manager.get_grammar(Language::Markdown); + assert!(grammar.is_some()); + } + + #[test] + fn test_unsupported_language() { + let manager = GrammarManager::new(); + // Toml has no compiled-in grammar. + let grammar = manager.get_grammar(Language::Toml); + assert!(grammar.is_none()); } @@ -304,6 +319,7 @@ mod tests { assert!(manager.is_supported(Language::Php)); assert!(manager.is_supported(Language::Yaml)); assert!(manager.is_supported(Language::Json)); - assert!(!manager.is_supported(Language::Markdown)); + assert!(manager.is_supported(Language::Markdown)); + assert!(!manager.is_supported(Language::Toml)); } } diff --git a/src/chunker/parser.rs b/src/chunker/parser.rs index a4fc193..0efe8b3 100644 --- a/src/chunker/parser.rs +++ b/src/chunker/parser.rs @@ -280,7 +280,8 @@ fn baz() {} let mut parser = CodeParser::new(); let source = "some code"; - let result = parser.parse(Language::Markdown, source); + // Toml has no compiled-in grammar, so parsing must fail. 
+ let result = parser.parse(Language::Toml, source); assert!(result.is_err()); } diff --git a/src/chunker/semantic.rs b/src/chunker/semantic.rs index b2557ab..1e061bb 100644 --- a/src/chunker/semantic.rs +++ b/src/chunker/semantic.rs @@ -42,12 +42,25 @@ impl SemanticChunker { path: &Path, content: &str, ) -> Result<Vec<Chunk>> { + // Markdown/txt are chunked by heading section rather than by definition + // node, so they take a dedicated path (no LanguageExtractor). + if language == Language::Markdown { + return self.chunk_markdown(path, content); + } + // 1. Check if we have an extractor for this language let extractor = match get_extractor(language) { Some(ext) => ext, None => { - // Fall back to simple chunking for unsupported languages - return Ok(self.fallback_chunk(path, content)); + // Fall back to simple chunking for unsupported languages. The + // line-windowed fallback ignores the char budget, so route its + // output through split_oversized to enforce max_chunk_chars and + // avoid pathological huge single chunks (e.g. minified one-line text). + return Ok(self + .fallback_chunk(path, content) + .into_iter() + .flat_map(|c| self.split_oversized(c)) + .collect()); } }; @@ -89,6 +102,180 @@ impl SemanticChunker { Ok(final_chunks) } + /// Chunk a Markdown/text file by heading section. + /// + /// Uses the tree-sitter-md *block* grammar: the document is a tree of nested + /// `section` nodes (one per heading). Each chunk is a single heading plus its + /// own prose/code, *excluding* nested subsections (which become their own + /// chunks). Heading text is carried in the breadcrumb context so the embedding + /// captures the section's place in the document (e.g. `File: x.md > Title > + /// Subsection`). Leading document content (YAML front-matter, prose before the + /// first heading) becomes a single preamble chunk. 
Oversized sections are + /// char/line-bounded via `split_oversized`, and a file with no parseable + /// structure falls back to the line-windowed chunker (also bounded). + fn chunk_markdown(&mut self, path: &Path, content: &str) -> Result<Vec<Chunk>> { + let bounded_fallback = |this: &Self| -> Vec<Chunk> { + this.fallback_chunk(path, content) + .into_iter() + .flat_map(|c| this.split_oversized(c)) + .collect() + }; + + let parsed = match self.parser.parse(Language::Markdown, content) { + Ok(p) => p, + Err(_) => return Ok(bounded_fallback(self)), + }; + + let source = content.as_bytes(); + let path_str = normalize_path(path); + let file_context = format!("File: {}", path_str); + let root = parsed.root_node(); + + let mut cursor = root.walk(); + let top: Vec<Node> = root.named_children(&mut cursor).collect(); + + let mut chunks: Vec<Chunk> = Vec::new(); + + // Leading non-section nodes (front-matter / prose before the first heading) + // form a single preamble chunk. + let mut preamble_end = 0; + while preamble_end < top.len() && top[preamble_end].kind() != "section" { + preamble_end += 1; + } + if preamble_end > 0 { + let start_byte = top[0].start_byte(); + let end_byte = top[preamble_end - 1].end_byte(); + if let Some(chunk) = Self::md_chunk( + source, + start_byte, + end_byte, + top[0].start_position().row, + std::slice::from_ref(&file_context), + &path_str, + ) { + chunks.push(chunk); + } + } + + // Each top-level section (and, recursively, its subsections) becomes a chunk.
+ for node in top.iter().filter(|n| n.kind() == "section") { + self.emit_md_section( + *node, + source, + &path_str, + std::slice::from_ref(&file_context), + &mut chunks, + ); + } + + if chunks.is_empty() { + return Ok(bounded_fallback(self)); + } + + let source_lines: Vec<&str> = content.lines().collect(); + self.populate_context_windows(&mut chunks, &source_lines); + + let final_chunks = chunks + .into_iter() + .flat_map(|c| self.split_oversized(c)) + .collect(); + Ok(final_chunks) + } + + /// Emit a chunk for one `section` node (heading + direct content), then recurse + /// into nested subsections with an extended breadcrumb. + fn emit_md_section( + &self, + section: Node, + source: &[u8], + path_str: &str, + context_stack: &[String], + chunks: &mut Vec<Chunk>, + ) { + let mut cursor = section.walk(); + let children: Vec<Node> = section.named_children(&mut cursor).collect(); + + // Heading text (if the section opens with one) extends the breadcrumb. + let heading_text = children + .first() + .filter(|c| Self::md_is_heading(c.kind())) + .map(|h| Self::md_heading_text(*h, source)) + .unwrap_or_default(); + + let mut new_context = context_stack.to_vec(); + if !heading_text.is_empty() { + new_context.push(heading_text); + } + + // Direct content = section start .. first nested subsection (exclusive). + let first_sub = children.iter().find(|c| c.kind() == "section"); + let end_byte = first_sub.map_or_else(|| section.end_byte(), |s| s.start_byte()); + if let Some(chunk) = Self::md_chunk( + source, + section.start_byte(), + end_byte, + section.start_position().row, + &new_context, + path_str, + ) { + chunks.push(chunk); + } + + for child in children.iter().filter(|c| c.kind() == "section") { + self.emit_md_section(*child, source, path_str, &new_context, chunks); + } + } + + /// Build a Markdown chunk from a byte range, or None if it is blank. 
+ fn md_chunk( + source: &[u8], + start_byte: usize, + end_byte: usize, + start_line: usize, + context: &[String], + path_str: &str, + ) -> Option<Chunk> { + let text = std::str::from_utf8(source.get(start_byte..end_byte)?).ok()?; + if text.trim().is_empty() { + return None; + } + let line_count = text.lines().count().max(1); + let mut chunk = Chunk::new( + text.to_string(), + start_line, + start_line + line_count, + ChunkKind::Block, + path_str.to_string(), + ); + chunk.context = context.to_vec(); + Some(chunk) + } + + /// True if a node kind is a Markdown heading. + fn md_is_heading(kind: &str) -> bool { + kind == "atx_heading" || kind == "setext_heading" + } + + /// Extract clean heading text (no `#` markers / underline). + fn md_heading_text(node: Node, source: &[u8]) -> String { + // atx_heading exposes the text via the `heading_content` field. + if let Some(inline) = node.child_by_field_name("heading_content") { + if let Ok(t) = inline.utf8_text(source) { + return t.trim().to_string(); + } + } + // Fallback (e.g. setext_heading): first line, stripped of '#'. + node.utf8_text(source) + .unwrap_or("") + .lines() + .next() + .unwrap_or("") + .trim() + .trim_matches('#') + .trim() + .to_string() + } + /// Populate context_prev and context_next for each chunk fn populate_context_windows(&self, chunks: &mut [Chunk], source_lines: &[&str]) { let total_lines = source_lines.len(); @@ -257,6 +444,88 @@ impl SemanticChunker { chunks } + /// Char- *and* line-aware splitter for unstructured text (Markdown/txt and the + /// generic fallback). Unlike `split_if_needed`, which windows purely by line + /// count, this also enforces `max_chunk_chars`: a single physical line longer + /// than the char budget is hard-split on UTF-8 boundaries. This is what keeps + /// scraped HTML/markdown — which can be one 80 KB line — from producing a single + /// enormous chunk. The structured code path keeps using `split_if_needed`, so + /// code chunking is unchanged. 
+    fn split_oversized(&self, chunk: Chunk) -> Vec<Chunk> {
+        if chunk.line_count() <= self.max_chunk_lines && chunk.size_bytes() <= self.max_chunk_chars
+        {
+            return vec![chunk];
+        }
+
+        // 1. Expand into "units": one per line, tagged with that line's 0-based offset.
+        //    A line over the byte budget is fragmented on char boundaries into several units.
+        let mut units: Vec<(usize, String)> = Vec::new();
+        for (line_no, line) in chunk.content.lines().enumerate() {
+            if line.len() <= self.max_chunk_chars {
+                units.push((line_no, line.to_string()));
+                continue;
+            }
+            let mut frag = String::new();
+            for ch in line.chars() {
+                if !frag.is_empty() && frag.len() + ch.len_utf8() > self.max_chunk_chars {
+                    units.push((line_no, std::mem::take(&mut frag)));
+                }
+                frag.push(ch);
+            }
+            if !frag.is_empty() {
+                units.push((line_no, frag));
+            }
+        }
+
+        if units.is_empty() {
+            return vec![chunk];
+        }
+
+        // 2. Greedily window units, bounded by both max_chunk_lines and
+        //    max_chunk_chars. Windows advance without overlap (context_prev/next
+        //    already supply surrounding lines), so no content is duplicated.
+        let mut out: Vec<Chunk> = Vec::new();
+        let mut i = 0;
+        let mut split_index = 0;
+        while i < units.len() {
+            let mut j = i;
+            let mut char_count = 0usize;
+            while j < units.len()
+                && (j - i) < self.max_chunk_lines
+                && (j == i || char_count + units[j].1.len() < self.max_chunk_chars)
+            {
+                char_count += units[j].1.len() + 1;
+                j += 1;
+            }
+            let end = if j > i { j } else { i + 1 };
+            let parts: Vec<&str> = units[i..end].iter().map(|u| u.1.as_str()).collect();
+
+            // Span uses source-line offsets: fragments of one long line must not inflate it.
+            let mut piece = Chunk::new(
+                parts.join("\n"),
+                chunk.start_line + units[i].0,
+                chunk.start_line + units[end - 1].0 + 1,
+                chunk.kind,
+                chunk.path.clone(),
+            );
+            piece.context = chunk.context.clone();
+            piece.signature = chunk.signature.clone();
+            piece.is_complete = false;
+            piece.split_index = Some(split_index);
+            out.push(piece);
+            split_index += 1;
+            i = end;
+        }
+
+        // A single resulting piece means no real split happened — keep it whole.
+        if out.len() == 1 {
+            out[0].is_complete = true;
+            out[0].split_index = None;
+        }
+
+        out
+    }
+
     /// Split a chunk if it exceeds size limits
     fn split_if_needed(&self, chunk: Chunk) -> Vec<Chunk> {
         let line_count = chunk.line_count();
@@ -586,6 +855,118 @@ class Calculator:
         );
     }
 
+    #[test]
+    fn test_chunk_markdown_sections() {
+        let mut chunker = SemanticChunker::new(100, 2000, 10);
+
+        let md = "---\nsource: dam_help\ntitle: E-mail ordering\nurl: https://help.aprimo.com/x\npath: dam_help/Ordering/EmailOrd\n---\n\n# E-mail ordering\n\nIntro paragraph about ordering.\n\n## Configure SMTP\n\nSteps to configure the mail server.\n\n## Troubleshooting\n\nFinal section text about errors.\n";
+
+        let path = Path::new("EmailOrd.md");
+        let chunks = chunker
+            .chunk_semantic(Language::Markdown, path, md)
+            .unwrap();
+
+        // Preamble (front-matter) + h1 intro + 2 h2 sections = at least 4 chunks.
+        assert!(
+            chunks.len() >= 4,
+            "Expected >=4 section chunks, got {}",
+            chunks.len()
+        );
+
+        // No chunk should span the whole page: the "Configure SMTP" body and the
+        // "Troubleshooting" body must live in *different* chunks.
+        let smtp = chunks
+            .iter()
+            .find(|c| c.content.contains("Steps to configure"))
+            .expect("should have a Configure SMTP chunk");
+        assert!(
+            !smtp.content.contains("Final section text"),
+            "sections must not be merged into a whole-page block"
+        );
+
+        // Breadcrumb context must carry the heading path (document title + section).
+        assert!(smtp.context.iter().any(|c| c.contains("E-mail ordering")));
+        assert!(smtp.context.iter().any(|c| c.contains("Configure SMTP")));
+
+        // Every chunk stays within the char budget.
+        assert!(chunks.iter().all(|c| c.content.len() <= 2000));
+    }
+
+    #[test]
+    fn test_chunk_markdown_nested_breadcrumb() {
+        let mut chunker = SemanticChunker::new(100, 2000, 10);
+        let md = "# Top\n\nlead\n\n## Middle\n\nmid body\n\n### Deep\n\ndeep body here\n";
+        let chunks = chunker
+            .chunk_semantic(Language::Markdown, Path::new("n.md"), md)
+            .unwrap();
+
+        let deep = chunks
+            .iter()
+            .find(|c| c.content.contains("deep body here"))
+            .expect("should find deep section");
+        // Expected breadcrumb: File > Top > Middle > Deep; only the headings are asserted.
+        assert!(deep.context.iter().any(|c| c.contains("Top")));
+        assert!(deep.context.iter().any(|c| c.contains("Middle")));
+        assert!(deep.context.iter().any(|c| c.contains("Deep")));
+        // The deep chunk must not contain its ancestors' bodies.
+        assert!(!deep.content.contains("mid body"));
+        assert!(!deep.content.contains("lead"));
+    }
+
+    #[test]
+    fn test_chunk_markdown_oversized_section_split() {
+        let mut chunker = SemanticChunker::new(100, 200, 5);
+        let big_body = (0..50)
+            .map(|i| format!("line of section body number {}", i))
+            .collect::<Vec<_>>()
+            .join("\n");
+        let md = format!("# Heading\n\n{}\n", big_body);
+
+        let chunks = chunker
+            .chunk_semantic(Language::Markdown, Path::new("big.md"), &md)
+            .unwrap();
+
+        // An oversized (>200-char) section must yield several chunks, some flagged incomplete.
+        assert!(chunks.len() > 1, "oversized section should be split");
+        assert!(chunks.iter().any(|c| !c.is_complete));
+    }
+
+    #[test]
+    fn test_chunk_markdown_hard_splits_long_line() {
+        // Mirrors real scraped Aprimo docs: a section whose body is ONE huge line
+        // (no internal newlines). Line-based splitting can't bound this; the
+        // char-aware splitter must.
+        let mut chunker = SemanticChunker::new(100, 500, 10);
+        let long_line = "word ".repeat(2000); // ~10_000 chars, single line
+        let md = format!("# Title\n\n{}\n", long_line);
+
+        let chunks = chunker
+            .chunk_semantic(Language::Markdown, Path::new("huge.md"), &md)
+            .unwrap();
+
+        assert!(chunks.len() > 1, "a single 10KB line must be hard-split");
+        assert!(
+            chunks.iter().all(|c| c.content.len() <= 500),
+            "every piece must respect the char budget; got max {}",
+            chunks.iter().map(|c| c.content.len()).max().unwrap_or(0)
+        );
+    }
+
+    #[test]
+    fn test_chunk_markdown_no_headings_falls_back() {
+        let mut chunker = SemanticChunker::new(100, 2000, 10);
+        let md = "Just some plain text\nwith a few lines\nand no headings at all.\n";
+        let chunks = chunker
+            .chunk_semantic(Language::Markdown, Path::new("plain.txt"), md)
+            .unwrap();
+
+        assert!(!chunks.is_empty());
+        // Spot-check: text from the first and last input lines survives chunking.
+        let joined: String = chunks.iter().map(|c| c.content.clone()).collect();
+        assert!(joined.contains("plain text"));
+        assert!(joined.contains("no headings"));
+    }
+
     #[test]
     fn test_chunk_unsupported_language() {
         let mut chunker = SemanticChunker::new(100, 2000, 10);
diff --git a/src/file/language.rs b/src/file/language.rs
index cd2f310..f164a23 100644
--- a/src/file/language.rs
+++ b/src/file/language.rs
@@ -105,6 +105,7 @@ impl Language {
                 | Self::Php
                 | Self::Yaml
                 | Self::Json
+                | Self::Markdown
         )
     }
 
@@ -176,7 +177,9 @@ mod tests {
         assert!(Language::Python.supports_tree_sitter());
         assert!(Language::TypeScript.supports_tree_sitter());
         assert!(Language::Json.supports_tree_sitter());
-        assert!(!Language::Markdown.supports_tree_sitter());
+        assert!(Language::Markdown.supports_tree_sitter());
+        // Toml has no tree-sitter grammar yet.
+ assert!(!Language::Toml.supports_tree_sitter()); } #[test] From 1dd9cac3160a96707ae679a09bd3be32435695df Mon Sep 17 00:00:00 2001 From: flupkede <flupkede@users.noreply.github.com> Date: Mon, 1 Jun 2026 23:57:38 +0200 Subject: [PATCH 5/7] [worker] final review: fix chunk_markdown doc comment Reference the actual splitter used by the markdown path (split_oversized, char-aware) instead of split_if_needed (the code path's line-based splitter). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/chunker/semantic.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cdc968a..bd5d932 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.143" +version = "1.0.144" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index 014cd5c..aba7d4e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.143" +version = "1.0.144" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/src/chunker/semantic.rs b/src/chunker/semantic.rs index 1e061bb..6f30284 100644 --- a/src/chunker/semantic.rs +++ b/src/chunker/semantic.rs @@ -111,7 +111,7 @@ impl SemanticChunker { /// captures the section's place in the document (e.g. `File: x.md > Title > /// Subsection`). Leading document content (YAML front-matter, prose before the /// first heading) becomes a single preamble chunk. Oversized sections are - /// char/line-bounded via `split_if_needed`, and a file with no parseable + /// char/line-bounded via `split_oversized`, and a file with no parseable /// structure falls back to the line-windowed chunker (also bounded). 
fn chunk_markdown(&mut self, path: &Path, content: &str) -> Result<Vec<Chunk>> { let bounded_fallback = |this: &Self| -> Vec<Chunk> { From 7a055663a6240e245e490bd62cbbec89dff0f0cc Mon Sep 17 00:00:00 2001 From: flupkede <flupkede@users.noreply.github.com> Date: Tue, 2 Jun 2026 00:26:24 +0200 Subject: [PATCH 6/7] docs: document semantic Markdown chunking + correct language table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - CHANGELOG: add [1.0.145] entry for tree-sitter-md block-grammar Markdown chunking (sections/headings/code fences). - README: expand the Supported Languages table to all 15 tree-sitter languages and bump the "9 languages" count to 15 — correcting pre-existing drift that omitted Shell, Ruby, PHP, YAML, JSON, and (new) Markdown. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --- CHANGELOG.md | 16 ++++++++++++++++ Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 21 ++++++++++++++------- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c4a3d2..5a92a5a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +## [1.0.145] - 2026-06-02 + +### Added + +- **Semantic Markdown chunking** — Markdown files (`.md`, `.markdown`, `.txt`) are + now parsed with the **tree-sitter-md block grammar**, so chunks align to sections, + headings, and code fences instead of arbitrary line ranges. `Language::Markdown` + now reports `supports_tree_sitter() == true` and has a compiled-in grammar. + +### Changed + +- **Supported-languages documentation corrected** — the README language table now + lists all 15 tree-sitter languages actually supported (Rust, Python, JavaScript, + TypeScript, C, C++, C#, Go, Java, Shell, Ruby, PHP, YAML, JSON, Markdown); + it previously showed only 9, omitting Shell, Ruby, PHP, YAML, JSON, and Markdown. 
+ ## [1.0.142] - 2026-06-01 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index bd5d932..a2d255e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.144" +version = "1.0.145" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index aba7d4e..b15568c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.144" +version = "1.0.145" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/README.md b/README.md index e4afec5..8ee61fe 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ codesearch gives AI agents (OpenCode, Claude Code, Cursor, and any MCP client) d - **Multi-repo serve mode**: Fan-out queries across repository groups with cross-repo RRF ranking - **Hybrid retrieval**: Vector embeddings + BM25 full-text search fused with Reciprocal Rank Fusion - **Symbol navigation**: Jump to definitions, find usages, trace imports and dependents — in the same tool -- **AST-aware chunking**: Tree-sitter parsing for 9 languages — chunks align to functions/classes, not arbitrary line ranges +- **AST-aware chunking**: Tree-sitter parsing for 15 languages — chunks align to functions/classes (and Markdown sections), not arbitrary line ranges - **Token-efficient**: Returns metadata by default; agents fetch full code only when needed via `get_chunk` - **Lightweight footprint**: Hundreds of MB on disk, runs on CPU only, no runtime model downloads (works behind enterprise proxies) - **Zero config for single repos**: `codesearch index && codesearch mcp` — done @@ -410,16 +410,23 @@ Tree-sitter AST-aware chunking: | Language | Extensions | |----------|-----------| | Rust | `.rs` | -| Python | `.py` | -| JavaScript | `.js`, `.jsx` | -| TypeScript | `.ts`, `.tsx` | +| Python | `.py`, `.pyw`, `.pyi` | +| JavaScript | `.js`, `.mjs`, `.cjs` | +| TypeScript | `.ts`, `.tsx`, `.jsx`, `.mts`, `.cts` | | C | 
`.c`, `.h` | -| C++ | `.cpp`, `.hpp` | +| C++ | `.cpp`, `.cc`, `.cxx`, `.hpp`, `.hxx` | | C# | `.cs` | | Go | `.go` | | Java | `.java` | - -All other text files use line-based chunking as fallback. +| Shell | `.sh`, `.bash`, `.zsh` | +| Ruby | `.rb`, `.rake` | +| PHP | `.php` | +| YAML | `.yaml`, `.yml` | +| JSON | `.json` | +| Markdown | `.md`, `.markdown`, `.txt` | + +Markdown uses the tree-sitter-md **block** grammar — chunks align to sections, +headings, and code fences. All other text files use line-based chunking as fallback. ## Core Technology From cda4a5d49c97aa2ebe9dae1711acb465ae5d1c91 Mon Sep 17 00:00:00 2001 From: flupkede <flupkede@users.noreply.github.com> Date: Tue, 2 Jun 2026 00:40:26 +0200 Subject: [PATCH 7/7] fix(test): sanitize customer ref in markdown chunking fixtures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pre-push customer-ref guard flagged "aprimo" in two semantic.rs test fixtures (a frontmatter URL and a comment). Replaced with generic example.com / "real-world scraped docs" — the test assertions never reference either, so behavior is unchanged. Realign CHANGELOG heading to the post-bump version (1.0.146). 
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --- CHANGELOG.md | 2 +- Cargo.lock | 2 +- Cargo.toml | 2 +- src/chunker/semantic.rs | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a92a5a..acb6322 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 -## [1.0.145] - 2026-06-02 +## [1.0.146] - 2026-06-02 ### Added diff --git a/Cargo.lock b/Cargo.lock index a2d255e..c22ffaa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,7 +628,7 @@ dependencies = [ [[package]] name = "codesearch" -version = "1.0.145" +version = "1.0.146" dependencies = [ "anyhow", "arroy", diff --git a/Cargo.toml b/Cargo.toml index b15568c..004caa1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codesearch" -version = "1.0.145" +version = "1.0.146" edition = "2021" authors = ["codesearch contributors"] license = "Apache-2.0" diff --git a/src/chunker/semantic.rs b/src/chunker/semantic.rs index 6f30284..3b8bc5a 100644 --- a/src/chunker/semantic.rs +++ b/src/chunker/semantic.rs @@ -859,7 +859,7 @@ class Calculator: fn test_chunk_markdown_sections() { let mut chunker = SemanticChunker::new(100, 2000, 10); - let md = "---\nsource: dam_help\ntitle: E-mail ordering\nurl: https://help.aprimo.com/x\npath: dam_help/Ordering/EmailOrd\n---\n\n# E-mail ordering\n\nIntro paragraph about ordering.\n\n## Configure SMTP\n\nSteps to configure the mail server.\n\n## Troubleshooting\n\nFinal section text about errors.\n"; + let md = "---\nsource: dam_help\ntitle: E-mail ordering\nurl: https://help.example.com/x\npath: dam_help/Ordering/EmailOrd\n---\n\n# E-mail ordering\n\nIntro paragraph about ordering.\n\n## Configure SMTP\n\nSteps to configure the mail server.\n\n## Troubleshooting\n\nFinal section text about errors.\n"; let path = Path::new("EmailOrd.md"); let chunks = chunker @@ -933,7 +933,7 @@ class Calculator: #[test] fn 
test_chunk_markdown_hard_splits_long_line() { - // Mirrors real scraped Aprimo docs: a section whose body is ONE huge line + // Mirrors real-world scraped docs: a section whose body is ONE huge line // (no internal newlines). Line-based splitting can't bound this; the // char-aware splitter must. let mut chunker = SemanticChunker::new(100, 500, 10);