From 0c0c7e976ac063b40f9f0f1eaf48b6bd5273ce18 Mon Sep 17 00:00:00 2001
From: flupkede <flupkede@users.noreply.github.com>
Date: Mon, 1 Jun 2026 21:44:33 +0200
Subject: [PATCH 1/7] chore: add /merge and /release Claude Code slash commands

Codify the project release workflow as two committed slash commands under
.claude/commands/ (force-added past .gitignore, like .claude/CLAUDE.md):

- /merge: README/CHANGELOG freshness checks -> commit -> validate -> push ->
  PR to develop -> auto-merge after CI. No tag.
- /release: /merge, then promote develop -> master via a "Release vX.Y.Z" PR
  (protect-master allows develop), then push the vX.Y.Z tag that triggers
  release.yml. Includes optional post-release develop sync.

Commands document the repo's real conventions: feature->develop->master flow,
master branch protection, and the pre-commit version-bump-on-feature-branches
rule that fixes the release version at the feature commit.

Tooling-only change on a chore/ branch: no version bump, no CHANGELOG entry
(CHANGELOG tracks the shipped binary's behavior).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .claude/commands/merge.md   | 79 +++++++++++++++++++++++++++++++++++++
 .claude/commands/release.md | 59 +++++++++++++++++++++++++++
 2 files changed, 138 insertions(+)
 create mode 100644 .claude/commands/merge.md
 create mode 100644 .claude/commands/release.md

diff --git a/.claude/commands/merge.md b/.claude/commands/merge.md
new file mode 100644
index 0000000..5ceff74
--- /dev/null
+++ b/.claude/commands/merge.md
@@ -0,0 +1,79 @@
+---
+description: Land the current feature branch on develop — README/CHANGELOG checks, commit, push, PR, auto-merge
+argument-hint: [optional PR title]
+allowed-tools: Bash(git:*), Bash(gh:*), Bash(cargo:*), Bash(grep:*), Read, Edit, Grep, Glob
+---
+
+# /merge — land the current feature branch on `develop`
+
+Run the project's **merge workflow**: verify docs are current, then bring the current
+feature branch into `develop` through a pull request. This command does **not** tag a
+release — tagging happens only in `/release`.
+
+## Branch & version facts (this repo)
+- Flow: `feature/*` | `features/*` | `fix/*` → PR → **`develop`** → (later) PR → **`master`**.
+- `master` is protected (`.github/workflows/protect-master.yml`): it accepts PRs only from
+  `develop` or `release/*`.
+- The pre-commit hook **bumps the patch version (+1) and rebuilds the binary on feature
+  branches only** (`feature/*`, `features/*`, `fix/*`). On `develop`/`master`/`release`/`chore`
+  it runs `cargo fmt` only — no bump. So **the feature-branch commit here fixes the release
+  version**; it carries forward unchanged through develop, master, and the tag.
+
+## Guardrails
+- ABORT if the current branch is `develop` or `master`. This command runs from a feature branch.
+- NEVER push directly to `develop` or `master` — everything lands via a PR.
+- NEVER pass `--no-verify` / `--no-gpg-sign` — let the pre-commit hook run (it bumps + rebuilds).
+- Do NOT create or push a tag here. That is `/release`'s job.
+- Do NOT force-push.
+
+## Steps
+
+1. **Context**
+   - `git rev-parse --abbrev-ref HEAD` → current branch. If `develop`/`master`, STOP with an error.
+   - `git fetch origin`.
+   - Compute the change set landing on develop: `git log origin/develop..HEAD --oneline`
+     plus `git status --short` for uncommitted work. If there is nothing to land, report and STOP.
+
+2. **README up to date?**
+   - Inspect the change set for user-facing changes: new/removed CLI flags or subcommands,
+     behavior changes, new env vars, new supported languages, new MCP tools.
+   - Compare against `README.md`. If anything is missing, wrong, or stale, **UPDATE `README.md`**
+     so it matches reality. Keep examples free of hardcoded config strings (per CLAUDE.md).
+   - If README already matches, state that and move on.
+
+3. **CHANGELOG up to date?**
+   - Ensure `CHANGELOG.md` has an entry for this change under a `## [X.Y.Z] - YYYY-MM-DD`
+     heading with `Added` / `Changed` / `Fixed` subsections describing every user-facing change.
+   - **Version for the heading** = current `Cargo.toml` version **+ 1 patch** (the hook will bump
+     to that value on commit). Read it: `grep -m1 '^version' Cargo.toml`.
+   - Use today's date. If an accurate entry already exists for the pending version, leave it.
+
+4. **Commit**
+   - Stage code + doc changes (`git add -A`, plus `git add -f` for any new file under a gitignored path).
+   - Commit with a clear, scoped message. End the message with:
+     `Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>`
+   - Let the pre-commit hook finish (fmt → version bump → rebuild). This can take 60–120s.
+
+5. **Validate** (fast loop, per CLAUDE.md — do NOT run `--release`):
+   - `cargo fmt --all -- --check`
+   - `cargo check --all-targets`
+   - `cargo clippy --all-targets -- -D warnings`
+   - Fix any failures and commit again before pushing. Never push code that fails these.
+
+6. **Push**
+   - `git push -u origin HEAD`.
+
+7. **Open PR → develop**
+   - `gh pr create --base develop --head <branch> --title "<title>" --body "<body>"`.
+   - Title: use `$ARGUMENTS` if provided; otherwise summarize the branch concisely.
+   - Body: bullet summary of changes; end with:
+     `🤖 Generated with [Claude Code](https://claude.com/claude-code)`.
+
+8. **Auto-merge after CI**
+   - `gh pr merge --auto --merge` so the PR lands automatically once required checks pass.
+   - If auto-merge is not enabled on the repo (command errors), fall back: poll
+     `gh pr checks <num> --watch`, then `gh pr merge --merge` once green.
+
+## Report
+Branch, pending release version, doc updates made, PR URL, and merge status
+(auto-merge enabled / merged).
diff --git a/.claude/commands/release.md b/.claude/commands/release.md
new file mode 100644
index 0000000..822d310
--- /dev/null
+++ b/.claude/commands/release.md
@@ -0,0 +1,59 @@
+---
+description: Cut a release — run /merge (feature → develop), then promote develop → master and push the version tag
+argument-hint: [optional PR/release title]
+allowed-tools: Bash(git:*), Bash(gh:*), Bash(cargo:*), Bash(grep:*), Read, Edit, Grep, Glob
+---
+
+# /release — full release: land on `develop`, promote to `master`, tag
+
+This is `/merge` **plus** the `develop → master` promotion and the version-tag push that
+triggers the build/publish pipeline.
+
+## Branch & version facts (this repo)
+- Flow: `feature/*` | `features/*` | `fix/*` → PR → **`develop`** → PR → **`master`** → push tag `vX.Y.Z`.
+- `master` is protected: PRs to it may come **only** from `develop` or `release/*`
+  (`.github/workflows/protect-master.yml`).
+- Pushing a `vX.Y.Z` tag triggers `.github/workflows/release.yml` (builds Windows/Linux/macOS
+  archives, plain + `-with-csharp`, and publishes the GitHub release). **Push the tag only
+  AFTER the develop→master PR has merged.**
+- The version is fixed by the feature-branch commit (the pre-commit hook bumps only on
+  feature branches). develop/master merges and the tag all carry that same version.
+
+## Guardrails
+- NEVER use `--no-verify`. NEVER force-push shared branches.
+- Push the tag exactly once, only after master has the release commit.
+- If CI fails at any gate, STOP and report — do not promote or tag a red build.
+
+## Part 1 — land on `develop` (the `/merge` workflow)
+Execute every step of **`/merge`** (README/CHANGELOG checks → commit → push → PR → auto-merge
+to `develop`). Then **wait for the develop PR to actually merge** (auto-merge waits on CI):
+- Get the PR number, then poll `gh pr view <num> --json state,mergedAt,mergeStateStatus`
+  until `state` is `MERGED`.
+- If checks fail, STOP and report. Do not proceed to Part 2.
+
+## Part 2 — promote `develop` → `master`
+1. `git fetch origin && git checkout develop && git pull --ff-only origin develop`.
+2. Determine the release version: `VERSION=v$(grep -m1 '^version' Cargo.toml | sed -E 's/.*"(.+)".*/\1/')`.
+3. Open the release PR (source `develop`, which protect-master allows):
+   - `gh pr create --base master --head develop --title "Release $VERSION — <summary>" --body "<body>"`.
+   - Title: prefix `Release $VERSION — ` then a short summary (or `$ARGUMENTS` if provided),
+     matching history (e.g. `Release v1.0.142 — serve responsive during warmup`).
+   - Body ends with: `🤖 Generated with [Claude Code](https://claude.com/claude-code)`.
+4. `gh pr merge --auto --merge`. Wait until `state` is `MERGED` (poll as in Part 1).
+   If auto-merge is unavailable, `gh pr checks <num> --watch` then `gh pr merge --merge`.
+   If CI fails, STOP.
+
+## Part 3 — tag the release
+1. `git fetch origin && git checkout master && git pull --ff-only origin master`.
+2. Confirm the version on master matches: `grep -m1 '^version' Cargo.toml` equals `$VERSION` (minus the `v`).
+   If it does not match, STOP and report (do not guess a tag).
+3. `git tag "$VERSION" && git push origin "$VERSION"` → triggers `release.yml`.
+4. Report the pushed tag and remind the user to watch the Actions "Release" run for artifacts.
+
+## Part 4 — keep `develop` in sync (only if needed)
+If `master` ended up ahead of `develop` (e.g. a CHANGELOG/version edit merged only on master),
+open a sync PR `master → develop` (or fast-forward develop) — matching the repo's post-release
+sync convention (e.g. PR #90 "sync: backfill CHANGELOG … from master"). Skip if already in sync.
+
+## Report
+develop PR URL, release PR URL, tag pushed (`vX.Y.Z`), final version, and sync action (if any).

From ea183c80f2d9c57d5f7d4502485fb7dd2c8b4ad7 Mon Sep 17 00:00:00 2001
From: flupkede <flupkede@users.noreply.github.com>
Date: Mon, 1 Jun 2026 21:47:37 +0200
Subject: [PATCH 2/7] chore: address review remarks on /merge and /release
 commands
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- /merge: abort unless on feature/*|features/*|fix/* (the only branches the
  pre-commit hook version-bumps) — closes the gap where running from a
  non-bumping branch silently broke the version/CHANGELOG premise.
- Clarify CHANGELOG heading version math for multi-commit landings (hook bumps
  +1 per commit; verify heading matches Cargo.toml after the final commit).
- Capture PR numbers explicitly (gh pr view --json number) before merge/poll.
- /release: fetch --tags and guard against a double release (stop if the tag
  already exists locally or on origin).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .claude/commands/merge.md   | 22 ++++++++++++++++------
 .claude/commands/release.md | 20 ++++++++++++--------
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/.claude/commands/merge.md b/.claude/commands/merge.md
index 5ceff74..3931644 100644
--- a/.claude/commands/merge.md
+++ b/.claude/commands/merge.md
@@ -20,7 +20,10 @@ release — tagging happens only in `/release`.
   version**; it carries forward unchanged through develop, master, and the tag.
 
 ## Guardrails
-- ABORT if the current branch is `develop` or `master`. This command runs from a feature branch.
+- ABORT unless the current branch matches `feature/*`, `features/*`, or `fix/*` — i.e. the
+  branches the pre-commit hook version-bumps. Never run from `develop`, `master`, `release/*`,
+  or `chore/*`: on those the hook does **not** bump, so the version/CHANGELOG premise below
+  would silently break.
 - NEVER push directly to `develop` or `master` — everything lands via a PR.
 - NEVER pass `--no-verify` / `--no-gpg-sign` — let the pre-commit hook run (it bumps + rebuilds).
 - Do NOT create or push a tag here. That is `/release`'s job.
@@ -29,7 +32,8 @@ release — tagging happens only in `/release`.
 ## Steps
 
 1. **Context**
-   - `git rev-parse --abbrev-ref HEAD` → current branch. If `develop`/`master`, STOP with an error.
+   - `git rev-parse --abbrev-ref HEAD` → current branch. If it is NOT `feature/*`, `features/*`,
+     or `fix/*`, STOP with an error (see Guardrails).
    - `git fetch origin`.
    - Compute the change set landing on develop: `git log origin/develop..HEAD --oneline`
      plus `git status --short` for uncommitted work. If there is nothing to land, report and STOP.
@@ -44,8 +48,12 @@ release — tagging happens only in `/release`.
 3. **CHANGELOG up to date?**
    - Ensure `CHANGELOG.md` has an entry for this change under a `## [X.Y.Z] - YYYY-MM-DD`
      heading with `Added` / `Changed` / `Fixed` subsections describing every user-facing change.
-   - **Version for the heading** = current `Cargo.toml` version **+ 1 patch** (the hook will bump
-     to that value on commit). Read it: `grep -m1 '^version' Cargo.toml`.
+   - **Version for the heading**: the hook bumps the patch by +1 on **every** feature-branch
+     commit where the working-tree version still equals HEAD's. The most reliable approach is to
+     land this branch in a **single commit** — then the heading version = current
+     `Cargo.toml` version + 1 (`grep -m1 '^version' Cargo.toml`). If you commit more than once,
+     the version advances once per commit; after the final commit, read the actual
+     `Cargo.toml` version and make sure the CHANGELOG heading matches it (fix it if not).
    - Use today's date. If an accurate entry already exists for the pending version, leave it.
 
 4. **Commit**
@@ -68,11 +76,13 @@ release — tagging happens only in `/release`.
    - Title: use `$ARGUMENTS` if provided; otherwise summarize the branch concisely.
    - Body: bullet summary of changes; end with:
      `🤖 Generated with [Claude Code](https://claude.com/claude-code)`.
+   - Capture the PR number for the next step:
+     `PR=$(gh pr view --json number --jq .number)`.
 
 8. **Auto-merge after CI**
-   - `gh pr merge --auto --merge` so the PR lands automatically once required checks pass.
+   - `gh pr merge "$PR" --auto --merge` so the PR lands automatically once required checks pass.
    - If auto-merge is not enabled on the repo (command errors), fall back: poll
-     `gh pr checks <num> --watch`, then `gh pr merge --merge` once green.
+     `gh pr checks "$PR" --watch`, then `gh pr merge "$PR" --merge` once green.
 
 ## Report
 Branch, pending release version, doc updates made, PR URL, and merge status
diff --git a/.claude/commands/release.md b/.claude/commands/release.md
index 822d310..16f2641 100644
--- a/.claude/commands/release.md
+++ b/.claude/commands/release.md
@@ -27,8 +27,8 @@ triggers the build/publish pipeline.
 ## Part 1 — land on `develop` (the `/merge` workflow)
 Execute every step of **`/merge`** (README/CHANGELOG checks → commit → push → PR → auto-merge
 to `develop`). Then **wait for the develop PR to actually merge** (auto-merge waits on CI):
-- Get the PR number, then poll `gh pr view <num> --json state,mergedAt,mergeStateStatus`
-  until `state` is `MERGED`.
+- Capture the PR number (`PR=$(gh pr view --json number --jq .number)`), then poll
+  `gh pr view "$PR" --json state,mergedAt,mergeStateStatus` until `state` is `MERGED`.
 - If checks fail, STOP and report. Do not proceed to Part 2.
 
 ## Part 2 — promote `develop` → `master`
@@ -39,16 +39,20 @@ to `develop`). Then **wait for the develop PR to actually merge** (auto-merge wa
    - Title: prefix `Release $VERSION — ` then a short summary (or `$ARGUMENTS` if provided),
      matching history (e.g. `Release v1.0.142 — serve responsive during warmup`).
    - Body ends with: `🤖 Generated with [Claude Code](https://claude.com/claude-code)`.
-4. `gh pr merge --auto --merge`. Wait until `state` is `MERGED` (poll as in Part 1).
-   If auto-merge is unavailable, `gh pr checks <num> --watch` then `gh pr merge --merge`.
-   If CI fails, STOP.
+   - Capture the PR number: `RELEASE_PR=$(gh pr view develop --json number --jq .number)`.
+4. `gh pr merge "$RELEASE_PR" --auto --merge`. Wait until `state` is
+   `MERGED` (poll as in Part 1). If auto-merge is unavailable, `gh pr checks "$RELEASE_PR" --watch`
+   then `gh pr merge "$RELEASE_PR" --merge`. If CI fails, STOP.
 
 ## Part 3 — tag the release
-1. `git fetch origin && git checkout master && git pull --ff-only origin master`.
+1. `git fetch origin --tags && git checkout master && git pull --ff-only origin master`.
 2. Confirm the version on master matches: `grep -m1 '^version' Cargo.toml` equals `$VERSION` (minus the `v`).
    If it does not match, STOP and report (do not guess a tag).
-3. `git tag "$VERSION" && git push origin "$VERSION"` → triggers `release.yml`.
-4. Report the pushed tag and remind the user to watch the Actions "Release" run for artifacts.
+3. Guard against a double release: if `$VERSION` already exists as a tag
+   (`git tag -l "$VERSION"` non-empty, or `git ls-remote --tags origin "$VERSION"` non-empty),
+   STOP — the release was already cut.
+4. `git tag "$VERSION" && git push origin "$VERSION"` → triggers `release.yml`.
+5. Report the pushed tag and remind the user to watch the Actions "Release" run for artifacts.
 
 ## Part 4 — keep `develop` in sync (only if needed)
 If `master` ended up ahead of `develop` (e.g. a CHANGELOG/version edit merged only on master),

From 981b4a8c49f3da2f757d151ba4e42442814972a5 Mon Sep 17 00:00:00 2001
From: flupkede <flupkede@users.noreply.github.com>
Date: Mon, 1 Jun 2026 21:56:45 +0200
Subject: [PATCH 3/7] docs: document /merge and /release workflow in AGENTS.md

Add a Release workflow section describing the two slash commands, the
branch-protection rule, the tag-triggers-release.yml pipeline, and the
feature-branch-only version-bump rule that fixes the release version.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 AGENTS.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/AGENTS.md b/AGENTS.md
index a6b967e..f239200 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -267,6 +267,27 @@ LMDB **does not allow** two `EnvOpenOptions::open()` handles on the same directo
 
 ---
 
+## Release workflow — `/merge` and `/release`
+
+Two committed Claude Code slash commands codify the release process
+(`.claude/commands/merge.md`, `.claude/commands/release.md`; force-added past `.gitignore`).
+
+- **`/merge`** — land the current feature branch on `develop`: README/CHANGELOG freshness
+  checks → commit → `cargo fmt`/`check`/`clippy` → push → PR to `develop` → `gh pr merge --auto`
+  (lands after CI). Does **not** tag.
+- **`/release`** — `/merge`, then promote `develop` → `master` via a `Release vX.Y.Z` PR
+  (`protect-master.yml` allows PRs to `master` only from `develop` or `release/*`), then push
+  the `vX.Y.Z` tag that triggers `.github/workflows/release.yml` (6 archives, plain +
+  `-with-csharp`). Includes an optional post-release `master → develop` sync.
+
+**Version rule (encoded in the commands):** the `pre-commit` hook bumps the patch (+1) and
+rebuilds **only on `feature/*` | `features/*` | `fix/*` branches**; `develop`/`master`/`release`/
+`chore` get `cargo fmt` only. So the release version is fixed at the feature-branch commit and
+carries forward unchanged through develop, master, and the tag. `/merge` therefore aborts unless
+run from a feature/fix branch.
+
+---
+
 ## Live Test Report — 2026-05-08
 
 **Versie**: codesearch v1.0.93+416  

From 884ccd65730b66dfbbed7f5f79d8efdd233ba86a Mon Sep 17 00:00:00 2001
From: flupkede <flupkede@users.noreply.github.com>
Date: Mon, 1 Jun 2026 23:24:08 +0200
Subject: [PATCH 4/7] feat(chunker): semantic Markdown chunking via
 tree-sitter-md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Markdown and .txt files were indexed as a single whole-file block (the
fallback chunker has no char budget), so a search hit returned an entire
page — real Aprimo docs reached 80 KB in one chunk.

Add the tree-sitter-md *block* grammar and chunk Markdown by heading
section instead: each chunk is one heading plus its own prose/code,
excluding nested subsections (which become their own chunks). The
heading path is carried in the breadcrumb context (File > Title >
Subsection) so embeddings capture each section's place in the document.

Also add split_oversized, a char- and line-aware splitter for the
unstructured paths (Markdown + the generic fallback): a single physical
line longer than the char budget is hard-split on UTF-8 boundaries, so
scraped one-line HTML/markdown can no longer produce an enormous chunk.
The structured code path keeps using split_if_needed unchanged, so code
chunking is unaffected.

- Cargo.toml: add tree-sitter-md 0.5.3
- grammar.rs/language.rs: register Markdown as tree-sitter-supported
- semantic.rs: chunk_markdown + emit_md_section + split_oversized
- tests: section split, nested breadcrumbs, oversized + long-line splits

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 Cargo.lock              |  13 +-
 Cargo.toml              |   3 +-
 src/chunker/grammar.rs  |  20 ++-
 src/chunker/parser.rs   |   3 +-
 src/chunker/semantic.rs | 385 +++++++++++++++++++++++++++++++++++++++-
 src/file/language.rs    |   5 +-
 6 files changed, 421 insertions(+), 8 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 9d36a43..cdc968a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -628,7 +628,7 @@ dependencies = [
 
 [[package]]
 name = "codesearch"
-version = "1.0.142"
+version = "1.0.143"
 dependencies = [
  "anyhow",
  "arroy",
@@ -686,6 +686,7 @@ dependencies = [
  "tree-sitter-java",
  "tree-sitter-javascript",
  "tree-sitter-json",
+ "tree-sitter-md",
  "tree-sitter-php",
  "tree-sitter-python",
  "tree-sitter-ruby",
@@ -5166,6 +5167,16 @@ version = "0.1.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782"
 
+[[package]]
+name = "tree-sitter-md"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2efd398be546456c814598ee56c0f51769a77241511b4a58077815d120afa882"
+dependencies = [
+ "cc",
+ "tree-sitter-language",
+]
+
 [[package]]
 name = "tree-sitter-php"
 version = "0.24.2"
diff --git a/Cargo.toml b/Cargo.toml
index e41c0b9..014cd5c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "codesearch"
-version = "1.0.142"
+version = "1.0.143"
 edition = "2021"
 authors = ["codesearch contributors"]
 license = "Apache-2.0"
@@ -52,6 +52,7 @@ tree-sitter-ruby = "0.23.1"
 tree-sitter-php = "0.24.2"
 tree-sitter-yaml = "0.7.2"
 tree-sitter-json = "0.24.8"
+tree-sitter-md = "0.5.3"
 
 # File handling
 ignore = "0.4"
diff --git a/src/chunker/grammar.rs b/src/chunker/grammar.rs
index e891628..a6e7dfb 100644
--- a/src/chunker/grammar.rs
+++ b/src/chunker/grammar.rs
@@ -72,6 +72,11 @@ impl GrammarManager {
             Language::Php => Ok(tree_sitter_php::LANGUAGE_PHP.into()),
             Language::Yaml => Ok(tree_sitter_yaml::LANGUAGE.into()),
             Language::Json => Ok(tree_sitter_json::LANGUAGE.into()),
+            // Markdown uses the tree-sitter-md *block* grammar (sections, headings,
+            // code fences). The inline grammar is intentionally not used: chunk
+            // boundaries only need block structure, and the block grammar runs on a
+            // plain `Parser` like every other language here.
+            Language::Markdown => Ok(tree_sitter_md::LANGUAGE.into()),
             _ => Err(anyhow!(
                 "Language {} does not support tree-sitter",
                 language.name()
@@ -96,6 +101,7 @@ impl GrammarManager {
             Language::Php,
             Language::Yaml,
             Language::Json,
+            Language::Markdown,
         ]
     }
 
@@ -251,10 +257,19 @@ mod tests {
     }
 
     #[test]
-    fn test_unsupported_language() {
+    fn test_load_markdown_grammar() {
         let manager = GrammarManager::new();
         let grammar = manager.get_grammar(Language::Markdown);
 
+        assert!(grammar.is_some());
+    }
+
+    #[test]
+    fn test_unsupported_language() {
+        let manager = GrammarManager::new();
+        // Toml has no compiled-in grammar.
+        let grammar = manager.get_grammar(Language::Toml);
+
         assert!(grammar.is_none());
     }
 
@@ -304,6 +319,7 @@ mod tests {
         assert!(manager.is_supported(Language::Php));
         assert!(manager.is_supported(Language::Yaml));
         assert!(manager.is_supported(Language::Json));
-        assert!(!manager.is_supported(Language::Markdown));
+        assert!(manager.is_supported(Language::Markdown));
+        assert!(!manager.is_supported(Language::Toml));
     }
 }
diff --git a/src/chunker/parser.rs b/src/chunker/parser.rs
index a4fc193..0efe8b3 100644
--- a/src/chunker/parser.rs
+++ b/src/chunker/parser.rs
@@ -280,7 +280,8 @@ fn baz() {}
         let mut parser = CodeParser::new();
         let source = "some code";
 
-        let result = parser.parse(Language::Markdown, source);
+        // Toml has no compiled-in grammar, so parsing must fail.
+        let result = parser.parse(Language::Toml, source);
         assert!(result.is_err());
     }
 
diff --git a/src/chunker/semantic.rs b/src/chunker/semantic.rs
index b2557ab..1e061bb 100644
--- a/src/chunker/semantic.rs
+++ b/src/chunker/semantic.rs
@@ -42,12 +42,25 @@ impl SemanticChunker {
         path: &Path,
         content: &str,
     ) -> Result<Vec<Chunk>> {
+        // Markdown/txt are chunked by heading section rather than by definition
+        // node, so they take a dedicated path (no LanguageExtractor).
+        if language == Language::Markdown {
+            return self.chunk_markdown(path, content);
+        }
+
         // 1. Check if we have an extractor for this language
         let extractor = match get_extractor(language) {
             Some(ext) => ext,
             None => {
-                // Fall back to simple chunking for unsupported languages
-                return Ok(self.fallback_chunk(path, content));
+                // Fall back to simple chunking for unsupported languages.  The
+                // line-windowed fallback ignores the char budget, so route its
+                // output through split_oversized to enforce max_chunk_chars and
+                // avoid pathological huge single chunks (e.g. minified one-line text).
+                return Ok(self
+                    .fallback_chunk(path, content)
+                    .into_iter()
+                    .flat_map(|c| self.split_oversized(c))
+                    .collect());
             }
         };
 
@@ -89,6 +102,180 @@ impl SemanticChunker {
         Ok(final_chunks)
     }
 
+    /// Chunk a Markdown/text file by heading section.
+    ///
+    /// Uses the tree-sitter-md *block* grammar: the document is a tree of nested
+    /// `section` nodes (one per heading). Each chunk is a single heading plus its
+    /// own prose/code, *excluding* nested subsections (which become their own
+    /// chunks). Heading text is carried in the breadcrumb context so the embedding
+    /// captures the section's place in the document (e.g. `File: x.md > Title >
+    /// Subsection`). Leading document content (YAML front-matter, prose before the
+    /// first heading) becomes a single preamble chunk. Oversized sections are
+    /// char/line-bounded via `split_oversized`, and a file with no parseable
+    /// structure falls back to the line-windowed chunker (also bounded).
+    fn chunk_markdown(&mut self, path: &Path, content: &str) -> Result<Vec<Chunk>> {
+        let bounded_fallback = |this: &Self| -> Vec<Chunk> {
+            this.fallback_chunk(path, content)
+                .into_iter()
+                .flat_map(|c| this.split_oversized(c))
+                .collect()
+        };
+
+        let parsed = match self.parser.parse(Language::Markdown, content) {
+            Ok(p) => p,
+            Err(_) => return Ok(bounded_fallback(self)),
+        };
+
+        let source = content.as_bytes();
+        let path_str = normalize_path(path);
+        let file_context = format!("File: {}", path_str);
+        let root = parsed.root_node();
+
+        let mut cursor = root.walk();
+        let top: Vec<Node> = root.named_children(&mut cursor).collect();
+
+        let mut chunks: Vec<Chunk> = Vec::new();
+
+        // Leading non-section nodes (front-matter / prose before the first heading)
+        // form a single preamble chunk.
+        let mut preamble_end = 0;
+        while preamble_end < top.len() && top[preamble_end].kind() != "section" {
+            preamble_end += 1;
+        }
+        if preamble_end > 0 {
+            let start_byte = top[0].start_byte();
+            let end_byte = top[preamble_end - 1].end_byte();
+            if let Some(chunk) = Self::md_chunk(
+                source,
+                start_byte,
+                end_byte,
+                top[0].start_position().row,
+                std::slice::from_ref(&file_context),
+                &path_str,
+            ) {
+                chunks.push(chunk);
+            }
+        }
+
+        // Each top-level section (and, recursively, its subsections) becomes a chunk.
+        for node in top.iter().filter(|n| n.kind() == "section") {
+            self.emit_md_section(
+                *node,
+                source,
+                &path_str,
+                std::slice::from_ref(&file_context),
+                &mut chunks,
+            );
+        }
+
+        if chunks.is_empty() {
+            return Ok(bounded_fallback(self));
+        }
+
+        let source_lines: Vec<&str> = content.lines().collect();
+        self.populate_context_windows(&mut chunks, &source_lines);
+
+        let final_chunks = chunks
+            .into_iter()
+            .flat_map(|c| self.split_oversized(c))
+            .collect();
+        Ok(final_chunks)
+    }
+
+    /// Emit a chunk for one `section` node (heading + direct content), then recurse
+    /// into nested subsections with an extended breadcrumb.
+    fn emit_md_section(
+        &self,
+        section: Node,
+        source: &[u8],
+        path_str: &str,
+        context_stack: &[String],
+        chunks: &mut Vec<Chunk>,
+    ) {
+        let mut cursor = section.walk();
+        let children: Vec<Node> = section.named_children(&mut cursor).collect();
+
+        // Heading text (if the section opens with one) extends the breadcrumb.
+        let heading_text = children
+            .first()
+            .filter(|c| Self::md_is_heading(c.kind()))
+            .map(|h| Self::md_heading_text(*h, source))
+            .unwrap_or_default();
+
+        let mut new_context = context_stack.to_vec();
+        if !heading_text.is_empty() {
+            new_context.push(heading_text);
+        }
+
+        // Direct content = section start .. first nested subsection (exclusive).
+        let first_sub = children.iter().find(|c| c.kind() == "section");
+        let end_byte = first_sub.map_or_else(|| section.end_byte(), |s| s.start_byte());
+        if let Some(chunk) = Self::md_chunk(
+            source,
+            section.start_byte(),
+            end_byte,
+            section.start_position().row,
+            &new_context,
+            path_str,
+        ) {
+            chunks.push(chunk);
+        }
+
+        for child in children.iter().filter(|c| c.kind() == "section") {
+            self.emit_md_section(*child, source, path_str, &new_context, chunks);
+        }
+    }
+
+    /// Build a Markdown chunk from a byte range, or None if it is blank.
+    fn md_chunk(
+        source: &[u8],
+        start_byte: usize,
+        end_byte: usize,
+        start_line: usize,
+        context: &[String],
+        path_str: &str,
+    ) -> Option<Chunk> {
+        let text = std::str::from_utf8(source.get(start_byte..end_byte)?).ok()?;
+        if text.trim().is_empty() {
+            return None;
+        }
+        let line_count = text.lines().count().max(1);
+        let mut chunk = Chunk::new(
+            text.to_string(),
+            start_line,
+            start_line + line_count,
+            ChunkKind::Block,
+            path_str.to_string(),
+        );
+        chunk.context = context.to_vec();
+        Some(chunk)
+    }
+
+    /// True if a node kind is a Markdown heading.
+    fn md_is_heading(kind: &str) -> bool {
+        kind == "atx_heading" || kind == "setext_heading"
+    }
+
+    /// Extract clean heading text (no `#` markers / underline).
+    fn md_heading_text(node: Node, source: &[u8]) -> String {
+        // atx_heading exposes the text via the `heading_content` field.
+        if let Some(inline) = node.child_by_field_name("heading_content") {
+            if let Ok(t) = inline.utf8_text(source) {
+                return t.trim().to_string();
+            }
+        }
+        // Fallback (e.g. setext_heading): first line, stripped of '#'.
+        node.utf8_text(source)
+            .unwrap_or("")
+            .lines()
+            .next()
+            .unwrap_or("")
+            .trim()
+            .trim_matches('#')
+            .trim()
+            .to_string()
+    }
+
     /// Populate context_prev and context_next for each chunk
     fn populate_context_windows(&self, chunks: &mut [Chunk], source_lines: &[&str]) {
         let total_lines = source_lines.len();
@@ -257,6 +444,88 @@ impl SemanticChunker {
         chunks
     }
 
+    /// Char- *and* line-aware splitter for unstructured text (Markdown/txt and the
+    /// generic fallback). Unlike `split_if_needed`, which windows purely by line
+    /// count, this also enforces `max_chunk_chars` (measured in bytes): a physical
+    /// line over that budget is hard-split on UTF-8 char boundaries, so a scraped
+    /// one-line 80 KB page cannot become one enormous chunk. Fragments of a split
+    /// line all report that line's number, so pieces never point past the source
+    /// range. The structured code path keeps using `split_if_needed`.
+    fn split_oversized(&self, chunk: Chunk) -> Vec<Chunk> {
+        if chunk.line_count() <= self.max_chunk_lines && chunk.size_bytes() <= self.max_chunk_chars
+        {
+            return vec![chunk];
+        }
+
+        // 1. Expand into "units" of (source line offset, text): one per line, but a
+        //    line over the char budget is fragmented so no unit exceeds the budget.
+        let mut units: Vec<(usize, String)> = Vec::new();
+        for (line_idx, line) in chunk.content.lines().enumerate() {
+            if line.len() <= self.max_chunk_chars {
+                units.push((line_idx, line.to_string()));
+                continue;
+            }
+            let mut frag = String::new();
+            for ch in line.chars() {
+                if !frag.is_empty() && frag.len() + ch.len_utf8() > self.max_chunk_chars {
+                    units.push((line_idx, std::mem::take(&mut frag)));
+                }
+                frag.push(ch);
+            }
+            if !frag.is_empty() {
+                units.push((line_idx, frag));
+            }
+        }
+
+        if units.is_empty() {
+            return vec![chunk];
+        }
+
+        // 2. Greedily window units, bounded by both max_chunk_lines and
+        //    max_chunk_chars. Windows advance without overlap (context_prev/next
+        //    already supply surrounding lines), so no content is duplicated.
+        let mut out: Vec<Chunk> = Vec::new();
+        let mut i = 0;
+        let mut split_index = 0;
+        while i < units.len() {
+            let mut j = i;
+            let mut char_count = 0usize;
+            while j < units.len()
+                && (j - i) < self.max_chunk_lines
+                && (j == i || char_count + units[j].1.len() < self.max_chunk_chars)
+            {
+                char_count += units[j].1.len() + 1;
+                j += 1;
+            }
+            let end = if j > i { j } else { i + 1 };
+            let texts: Vec<&str> = units[i..end].iter().map(|(_, s)| s.as_str()).collect();
+            // Line numbers come from the source line offsets, not the unit indices,
+            // which run ahead of the real lines once a long line is fragmented.
+            let mut piece = Chunk::new(
+                texts.join("\n"),
+                chunk.start_line + units[i].0,
+                chunk.start_line + units[end - 1].0 + 1,
+                chunk.kind,
+                chunk.path.clone(),
+            );
+            piece.context = chunk.context.clone();
+            piece.signature = chunk.signature.clone();
+            piece.is_complete = false;
+            piece.split_index = Some(split_index);
+            out.push(piece);
+            split_index += 1;
+            i = end;
+        }
+
+        // A single resulting piece means no real split happened — keep it whole.
+        if out.len() == 1 {
+            out[0].is_complete = true;
+            out[0].split_index = None;
+        }
+
+        out
+    }
+
     /// Split a chunk if it exceeds size limits
     fn split_if_needed(&self, chunk: Chunk) -> Vec<Chunk> {
         let line_count = chunk.line_count();
@@ -586,6 +855,118 @@ class Calculator:
         );
     }
 
+    #[test]
+    fn test_chunk_markdown_sections() {
+        let mut chunker = SemanticChunker::new(100, 2000, 10);
+
+        let md = "---\nsource: dam_help\ntitle: E-mail ordering\nurl: https://help.aprimo.com/x\npath: dam_help/Ordering/EmailOrd\n---\n\n# E-mail ordering\n\nIntro paragraph about ordering.\n\n## Configure SMTP\n\nSteps to configure the mail server.\n\n## Troubleshooting\n\nFinal section text about errors.\n";
+
+        let path = Path::new("EmailOrd.md");
+        let chunks = chunker
+            .chunk_semantic(Language::Markdown, path, md)
+            .unwrap();
+
+        // Preamble (front-matter) + h1 intro + 2 h2 sections = at least 4 chunks.
+        assert!(
+            chunks.len() >= 4,
+            "Expected >=4 section chunks, got {}",
+            chunks.len()
+        );
+
+        // No chunk should span the whole page: the "Configure SMTP" body and the
+        // "Troubleshooting" body must live in *different* chunks.
+        let smtp = chunks
+            .iter()
+            .find(|c| c.content.contains("Steps to configure"))
+            .expect("should have a Configure SMTP chunk");
+        assert!(
+            !smtp.content.contains("Final section text"),
+            "sections must not be merged into a whole-page block"
+        );
+
+        // Breadcrumb context must carry the heading path (document title + section).
+        assert!(smtp.context.iter().any(|c| c.contains("E-mail ordering")));
+        assert!(smtp.context.iter().any(|c| c.contains("Configure SMTP")));
+
+        // Every chunk stays within the char budget.
+        assert!(chunks.iter().all(|c| c.content.len() <= 2000));
+    }
+
+    #[test]
+    fn test_chunk_markdown_nested_breadcrumb() {
+        let mut chunker = SemanticChunker::new(100, 2000, 10);
+        let md = "# Top\n\nlead\n\n## Middle\n\nmid body\n\n### Deep\n\ndeep body here\n";
+        let chunks = chunker
+            .chunk_semantic(Language::Markdown, Path::new("n.md"), md)
+            .unwrap();
+
+        let deep = chunks
+            .iter()
+            .find(|c| c.content.contains("deep body here"))
+            .expect("should find deep section");
+        // File > Top > Middle > Deep
+        assert!(deep.context.iter().any(|c| c.contains("Top")));
+        assert!(deep.context.iter().any(|c| c.contains("Middle")));
+        assert!(deep.context.iter().any(|c| c.contains("Deep")));
+        // The deep chunk must not contain its ancestors' bodies.
+        assert!(!deep.content.contains("mid body"));
+        assert!(!deep.content.contains("lead"));
+    }
+
+    #[test]
+    fn test_chunk_markdown_oversized_section_split() {
+        let mut chunker = SemanticChunker::new(100, 200, 5);
+        let big_body = (0..50)
+            .map(|i| format!("line of section body number {}", i))
+            .collect::<Vec<_>>()
+            .join("\n");
+        let md = format!("# Heading\n\n{}\n", big_body);
+
+        let chunks = chunker
+            .chunk_semantic(Language::Markdown, Path::new("big.md"), &md)
+            .unwrap();
+
+        // A single >200-char section must be split into multiple bounded parts.
+        assert!(chunks.len() > 1, "oversized section should be split");
+        assert!(chunks.iter().any(|c| !c.is_complete));
+    }
+
+    #[test]
+    fn test_chunk_markdown_hard_splits_long_line() {
+        // Mirrors real scraped Aprimo docs: a section whose body is ONE huge line
+        // (no internal newlines). Line-based splitting can't bound this; the
+        // char-aware splitter must.
+        let mut chunker = SemanticChunker::new(100, 500, 10);
+        let long_line = "word ".repeat(2000); // ~10_000 chars, single line
+        let md = format!("# Title\n\n{}\n", long_line);
+
+        let chunks = chunker
+            .chunk_semantic(Language::Markdown, Path::new("huge.md"), &md)
+            .unwrap();
+
+        assert!(chunks.len() > 1, "a single 10KB line must be hard-split");
+        assert!(
+            chunks.iter().all(|c| c.content.len() <= 500),
+            "every piece must respect the char budget; got max {}",
+            chunks.iter().map(|c| c.content.len()).max().unwrap_or(0)
+        );
+    }
+
+    #[test]
+    fn test_chunk_markdown_no_headings_falls_back() {
+        let mut chunker = SemanticChunker::new(100, 2000, 10);
+        let md = "Just some plain text\nwith a few lines\nand no headings at all.\n";
+        let chunks = chunker
+            .chunk_semantic(Language::Markdown, Path::new("plain.txt"), md)
+            .unwrap();
+
+        assert!(!chunks.is_empty());
+        // All content is preserved across chunks.
+        let joined: String = chunks.iter().map(|c| c.content.clone()).collect();
+        assert!(joined.contains("plain text"));
+        assert!(joined.contains("no headings"));
+    }
+
     #[test]
     fn test_chunk_unsupported_language() {
         let mut chunker = SemanticChunker::new(100, 2000, 10);
diff --git a/src/file/language.rs b/src/file/language.rs
index cd2f310..f164a23 100644
--- a/src/file/language.rs
+++ b/src/file/language.rs
@@ -105,6 +105,7 @@ impl Language {
                 | Self::Php
                 | Self::Yaml
                 | Self::Json
+                | Self::Markdown
         )
     }
 
@@ -176,7 +177,9 @@ mod tests {
         assert!(Language::Python.supports_tree_sitter());
         assert!(Language::TypeScript.supports_tree_sitter());
         assert!(Language::Json.supports_tree_sitter());
-        assert!(!Language::Markdown.supports_tree_sitter());
+        assert!(Language::Markdown.supports_tree_sitter());
+        // Toml has no tree-sitter grammar yet.
+        assert!(!Language::Toml.supports_tree_sitter());
     }
 
     #[test]

From 1dd9cac3160a96707ae679a09bd3be32435695df Mon Sep 17 00:00:00 2001
From: flupkede <flupkede@users.noreply.github.com>
Date: Mon, 1 Jun 2026 23:57:38 +0200
Subject: [PATCH 5/7] [worker] final review: fix chunk_markdown doc comment

Reference the actual splitter used by the markdown path
(split_oversized, char-aware) instead of split_if_needed
(the code path's line-based splitter).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 Cargo.lock              | 2 +-
 Cargo.toml              | 2 +-
 src/chunker/semantic.rs | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index cdc968a..bd5d932 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -628,7 +628,7 @@ dependencies = [
 
 [[package]]
 name = "codesearch"
-version = "1.0.143"
+version = "1.0.144"
 dependencies = [
  "anyhow",
  "arroy",
diff --git a/Cargo.toml b/Cargo.toml
index 014cd5c..aba7d4e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "codesearch"
-version = "1.0.143"
+version = "1.0.144"
 edition = "2021"
 authors = ["codesearch contributors"]
 license = "Apache-2.0"
diff --git a/src/chunker/semantic.rs b/src/chunker/semantic.rs
index 1e061bb..6f30284 100644
--- a/src/chunker/semantic.rs
+++ b/src/chunker/semantic.rs
@@ -111,7 +111,7 @@ impl SemanticChunker {
     /// captures the section's place in the document (e.g. `File: x.md > Title >
     /// Subsection`). Leading document content (YAML front-matter, prose before the
     /// first heading) becomes a single preamble chunk. Oversized sections are
-    /// char/line-bounded via `split_if_needed`, and a file with no parseable
+    /// char/line-bounded via `split_oversized`, and a file with no parseable
     /// structure falls back to the line-windowed chunker (also bounded).
     fn chunk_markdown(&mut self, path: &Path, content: &str) -> Result<Vec<Chunk>> {
         let bounded_fallback = |this: &Self| -> Vec<Chunk> {

From 7a055663a6240e245e490bd62cbbec89dff0f0cc Mon Sep 17 00:00:00 2001
From: flupkede <flupkede@users.noreply.github.com>
Date: Tue, 2 Jun 2026 00:26:24 +0200
Subject: [PATCH 6/7] docs: document semantic Markdown chunking + correct
 language table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- CHANGELOG: add [1.0.145] entry for tree-sitter-md block-grammar Markdown
  chunking (sections/headings/code fences).
- README: expand the Supported Languages table to all 15 tree-sitter
  languages and bump the "9 languages" count to 15 — correcting pre-existing
  drift that omitted Shell, Ruby, PHP, YAML, JSON, and (new) Markdown.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md | 16 ++++++++++++++++
 Cargo.lock   |  2 +-
 Cargo.toml   |  2 +-
 README.md    | 21 ++++++++++++++-------
 4 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2c4a3d2..5a92a5a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 
 
+## [1.0.145] - 2026-06-02
+
+### Added
+
+- **Semantic Markdown chunking** — Markdown files (`.md`, `.markdown`, `.txt`) are
+  now parsed with the **tree-sitter-md block grammar**, so chunks align to sections,
+  headings, and code fences instead of arbitrary line ranges. `Language::Markdown`
+  now reports `supports_tree_sitter() == true` and has a compiled-in grammar.
+
+### Changed
+
+- **Supported-languages documentation corrected** — the README language table now
+  lists all 15 tree-sitter languages actually supported (Rust, Python, JavaScript,
+  TypeScript, C, C++, C#, Go, Java, Shell, Ruby, PHP, YAML, JSON, Markdown);
+  it previously showed only 9, omitting Shell, Ruby, PHP, YAML, JSON, and Markdown.
+
 ## [1.0.142] - 2026-06-01
 
 ### Fixed
diff --git a/Cargo.lock b/Cargo.lock
index bd5d932..a2d255e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -628,7 +628,7 @@ dependencies = [
 
 [[package]]
 name = "codesearch"
-version = "1.0.144"
+version = "1.0.145"
 dependencies = [
  "anyhow",
  "arroy",
diff --git a/Cargo.toml b/Cargo.toml
index aba7d4e..b15568c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "codesearch"
-version = "1.0.144"
+version = "1.0.145"
 edition = "2021"
 authors = ["codesearch contributors"]
 license = "Apache-2.0"
diff --git a/README.md b/README.md
index e4afec5..8ee61fe 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ codesearch gives AI agents (OpenCode, Claude Code, Cursor, and any MCP client) d
 - **Multi-repo serve mode**: Fan-out queries across repository groups with cross-repo RRF ranking
 - **Hybrid retrieval**: Vector embeddings + BM25 full-text search fused with Reciprocal Rank Fusion
 - **Symbol navigation**: Jump to definitions, find usages, trace imports and dependents — in the same tool
-- **AST-aware chunking**: Tree-sitter parsing for 9 languages — chunks align to functions/classes, not arbitrary line ranges
+- **AST-aware chunking**: Tree-sitter parsing for 15 languages — chunks align to functions/classes (and Markdown sections), not arbitrary line ranges
 - **Token-efficient**: Returns metadata by default; agents fetch full code only when needed via `get_chunk`
 - **Lightweight footprint**: Hundreds of MB on disk, runs on CPU only, no runtime model downloads (works behind enterprise proxies)
 - **Zero config for single repos**: `codesearch index && codesearch mcp` — done
@@ -410,16 +410,23 @@ Tree-sitter AST-aware chunking:
 | Language | Extensions |
 |----------|-----------|
 | Rust | `.rs` |
-| Python | `.py` |
-| JavaScript | `.js`, `.jsx` |
-| TypeScript | `.ts`, `.tsx` |
+| Python | `.py`, `.pyw`, `.pyi` |
+| JavaScript | `.js`, `.mjs`, `.cjs` |
+| TypeScript | `.ts`, `.tsx`, `.jsx`, `.mts`, `.cts` |
 | C | `.c`, `.h` |
-| C++ | `.cpp`, `.hpp` |
+| C++ | `.cpp`, `.cc`, `.cxx`, `.hpp`, `.hxx` |
 | C# | `.cs` |
 | Go | `.go` |
 | Java | `.java` |
-
-All other text files use line-based chunking as fallback.
+| Shell | `.sh`, `.bash`, `.zsh` |
+| Ruby | `.rb`, `.rake` |
+| PHP | `.php` |
+| YAML | `.yaml`, `.yml` |
+| JSON | `.json` |
+| Markdown | `.md`, `.markdown`, `.txt` |
+
+Markdown uses the tree-sitter-md **block** grammar — chunks align to sections,
+headings, and code fences. All other text files use line-based chunking as fallback.
 
 ## Core Technology
 

From cda4a5d49c97aa2ebe9dae1711acb465ae5d1c91 Mon Sep 17 00:00:00 2001
From: flupkede <flupkede@users.noreply.github.com>
Date: Tue, 2 Jun 2026 00:40:26 +0200
Subject: [PATCH 7/7] fix(test): sanitize customer ref in markdown chunking
 fixtures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pre-push customer-ref guard flagged "aprimo" in two semantic.rs test
fixtures (a frontmatter URL and a comment). Replaced with generic
example.com / "real-world scraped docs" — the test assertions never
reference either, so behavior is unchanged. Realign CHANGELOG heading to
the post-bump version (1.0.146).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md            | 2 +-
 Cargo.lock              | 2 +-
 Cargo.toml              | 2 +-
 src/chunker/semantic.rs | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5a92a5a..acb6322 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 
 
-## [1.0.145] - 2026-06-02
+## [1.0.146] - 2026-06-02
 
 ### Added
 
diff --git a/Cargo.lock b/Cargo.lock
index a2d255e..c22ffaa 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -628,7 +628,7 @@ dependencies = [
 
 [[package]]
 name = "codesearch"
-version = "1.0.145"
+version = "1.0.146"
 dependencies = [
  "anyhow",
  "arroy",
diff --git a/Cargo.toml b/Cargo.toml
index b15568c..004caa1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "codesearch"
-version = "1.0.145"
+version = "1.0.146"
 edition = "2021"
 authors = ["codesearch contributors"]
 license = "Apache-2.0"
diff --git a/src/chunker/semantic.rs b/src/chunker/semantic.rs
index 6f30284..3b8bc5a 100644
--- a/src/chunker/semantic.rs
+++ b/src/chunker/semantic.rs
@@ -859,7 +859,7 @@ class Calculator:
     fn test_chunk_markdown_sections() {
         let mut chunker = SemanticChunker::new(100, 2000, 10);
 
-        let md = "---\nsource: dam_help\ntitle: E-mail ordering\nurl: https://help.aprimo.com/x\npath: dam_help/Ordering/EmailOrd\n---\n\n# E-mail ordering\n\nIntro paragraph about ordering.\n\n## Configure SMTP\n\nSteps to configure the mail server.\n\n## Troubleshooting\n\nFinal section text about errors.\n";
+        let md = "---\nsource: dam_help\ntitle: E-mail ordering\nurl: https://help.example.com/x\npath: dam_help/Ordering/EmailOrd\n---\n\n# E-mail ordering\n\nIntro paragraph about ordering.\n\n## Configure SMTP\n\nSteps to configure the mail server.\n\n## Troubleshooting\n\nFinal section text about errors.\n";
 
         let path = Path::new("EmailOrd.md");
         let chunks = chunker
@@ -933,7 +933,7 @@ class Calculator:
 
     #[test]
     fn test_chunk_markdown_hard_splits_long_line() {
-        // Mirrors real scraped Aprimo docs: a section whose body is ONE huge line
+        // Mirrors real-world scraped docs: a section whose body is ONE huge line
         // (no internal newlines). Line-based splitting can't bound this; the
         // char-aware splitter must.
         let mut chunker = SemanticChunker::new(100, 500, 10);