diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..138e464
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,67 @@
+# Validation pipeline: type-check, build, verify the package contents, and run
+# the tests for pull requests and pushes to main and the dev*/canary*/alpha*/beta* branches.
+name: CI
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+      - "dev*"
+      - "canary*"
+      - "alpha*"
+      - "beta*"
+
+# No step here writes to the repository, so keep GITHUB_TOKEN read-only.
+permissions:
+  contents: read
+
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 1
+          show-progress: false
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: latest
+
+      - name: Setup Node
+        uses: actions/setup-node@v6
+        with:
+          node-version: 20
+
+      # Rust toolchain with the wasm32 target plus wasm-pack are installed before
+      # the build step (presumably for crates/language-detector — confirm).
+      - name: Setup Rust toolchain
+        uses: dtolnay/rust-toolchain@stable
+        with:
+          targets: wasm32-unknown-unknown
+
+      - name: Cache Rust artifacts
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: crates/language-detector
+
+      - name: Install wasm-pack
+        run: cargo install wasm-pack --locked
+
+      - name: Install dependencies
+        run: bun install --frozen-lockfile
+
+      - name: Type check
+        run: bun run type-check
+
+      - name: Build
+        run: bun run build
+
+      - name: Verify package contents
+        run: bun run verify:package-contents
+
+      - name: Test
+        run: bun test
diff --git a/.github/workflows/publish-github-packages.yml b/.github/workflows/publish-github-packages.yml
deleted file mode 100644
index 4ba5fc5..0000000
--- a/.github/workflows/publish-github-packages.yml
+++ /dev/null
@@ -1,145 +0,0 @@
-name: Publish to GitHub Packages
-
-on:
-  push:
-    tags:
-      - "*"
-  workflow_dispatch:
-    inputs:
-      tag:
-        description: "Tag to publish (e.g. v1.2.3)"
-        required: true
-        type: string
-      shallow_since:
-        description: "Optional shallow fetch cutoff date (YYYY-MM-DD) for branch validation; leave blank to unshallow."
-        required: false
-        type: string
-
-permissions:
-  contents: read
-  packages: write
-
-jobs:
-  publish:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          # Keep checkout light; branch validation will deepen/unshallow if needed.
- fetch-depth: 1 - ref: ${{ inputs.tag && format('refs/tags/{0}', inputs.tag) || github.ref }} - show-progress: false - - - name: Resolve tag - id: tag - env: - TAG_INPUT: ${{ inputs.tag }} - run: | - CLEAN_TAG_INPUT=$(printf '%s' "$TAG_INPUT" | tr -d '\r' | xargs) - if [ -n "$CLEAN_TAG_INPUT" ]; then - echo "tag=$CLEAN_TAG_INPUT" >> "$GITHUB_OUTPUT" - else - echo "tag=${GITHUB_REF#refs/tags/}" >> "$GITHUB_OUTPUT" - fi - - - name: Ensure tag is on allowed branch - env: - SHALLOW_SINCE: ${{ inputs.shallow_since }} - run: | - FETCH_DEPTH_ARGS=() - if [ -n "$SHALLOW_SINCE" ]; then - FETCH_DEPTH_ARGS=(--shallow-since="$SHALLOW_SINCE") - elif [ -f .git/shallow ]; then - git fetch origin --tags --unshallow - fi - - git fetch origin --tags "${FETCH_DEPTH_ARGS[@]}" - git fetch origin --no-tags main "${FETCH_DEPTH_ARGS[@]}" - git fetch origin \ - --no-tags \ - "refs/heads/beta*:refs/remotes/origin/beta*" \ - "refs/heads/alpha*:refs/remotes/origin/alpha*" \ - "refs/heads/canary*:refs/remotes/origin/canary*" \ - "refs/heads/dev*:refs/remotes/origin/dev*" \ - "${FETCH_DEPTH_ARGS[@]}" || true - TAG_COMMIT=$(git rev-list -n 1 "${{ steps.tag.outputs.tag }}") - if git merge-base --is-ancestor "$TAG_COMMIT" "origin/main"; then - echo "Tag commit is on main." - exit 0 - fi - - MATCHING_BRANCH=$(git for-each-ref --format='%(refname:short)' refs/remotes/origin \ - | grep -Ei 'origin/.*(beta|alpha|canary|dev)' \ - | while read -r branch; do - if git merge-base --is-ancestor "$TAG_COMMIT" "$branch"; then - echo "$branch" - break - fi - done) - - note() { echo "$1"; } - if [ -n "$MATCHING_BRANCH" ]; then - note "Tag commit is on allowed branch: $MATCHING_BRANCH" - else - note "Tag commit is not on main or an allowed branch (beta/alpha/canary/dev)." 
>&2 - exit 1 - fi - - - name: Check pre-release status - id: prerelease - env: - TAG_NAME: ${{ steps.tag.outputs.tag }} - run: | - PRERELEASE_LABEL="" - if [[ "$TAG_NAME" == *-alpha* ]]; then - PRERELEASE_LABEL="alpha" - elif [[ "$TAG_NAME" == *-beta* ]]; then - PRERELEASE_LABEL="beta" - elif [[ "$TAG_NAME" == *-rc* ]]; then - PRERELEASE_LABEL="rc" - elif [[ "$TAG_NAME" == *-canary* ]]; then - PRERELEASE_LABEL="canary" - fi - - if [[ -n "$PRERELEASE_LABEL" ]]; then - echo "IS_PRERELEASE=true" >> "$GITHUB_OUTPUT" - else - echo "IS_PRERELEASE=false" >> "$GITHUB_OUTPUT" - fi - - echo "PRERELEASE_LABEL=$PRERELEASE_LABEL" >> "$GITHUB_OUTPUT" - - - name: Setup Bun - uses: oven-sh/setup-bun@v2 - with: - bun-version: latest - - - name: Setup Node - uses: actions/setup-node@v6 - with: - node-version: 20 - registry-url: https://npm.pkg.github.com - scope: "@${{ github.repository_owner }}" - - - name: Install dependencies - run: bun install --frozen-lockfile - - - name: Build - run: bun run build - - - name: Set package name for GitHub Packages - run: npm pkg set name=@${{ github.repository_owner }}/word-counter - - - name: Publish - env: - NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - if [[ "${{ steps.prerelease.outputs.IS_PRERELEASE }}" == "true" ]]; then - npm publish --registry https://npm.pkg.github.com --access restricted --tag next - PKG_NAME=$(node -p "require('./package.json').name") - PKG_VERSION=$(node -p "require('./package.json').version") - npm dist-tag add "$PKG_NAME@$PKG_VERSION" "${{ steps.prerelease.outputs.PRERELEASE_LABEL }}" --registry https://npm.pkg.github.com - else - npm publish --registry https://npm.pkg.github.com --access restricted - fi diff --git a/.github/workflows/publish-npm-packages.yml b/.github/workflows/publish-npm-packages.yml deleted file mode 100644 index 5a789ce..0000000 --- a/.github/workflows/publish-npm-packages.yml +++ /dev/null @@ -1,150 +0,0 @@ -name: Publish to NPM Packages - -on: - push: - tags: - - "*" - - 
"!*-alpha*" - - "!*-beta*" - workflow_dispatch: - inputs: - tag: - description: "Tag to publish (e.g. v1.2.3)" - required: true - type: string - shallow_since: - description: "Optional shallow fetch cutoff date (YYYY-MM-DD) for branch validation; leave blank to unshallow." - required: false - type: string - -permissions: - id-token: write # Required for OIDC authentication - contents: read - -jobs: - publish: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v6 - with: - # Keep checkout light; branch validation will deepen/unshallow if needed. - fetch-depth: 1 - ref: ${{ inputs.tag && format('refs/tags/{0}', inputs.tag) || github.ref }} - show-progress: false - - - name: Resolve tag - id: tag - env: - TAG_INPUT: ${{ inputs.tag }} - run: | - CLEAN_TAG_INPUT=$(printf '%s' "$TAG_INPUT" | tr -d '\r' | xargs) - if [ -n "$CLEAN_TAG_INPUT" ]; then - echo "tag=$CLEAN_TAG_INPUT" >> "$GITHUB_OUTPUT" - else - echo "tag=${GITHUB_REF#refs/tags/}" >> "$GITHUB_OUTPUT" - fi - - - name: Ensure tag is on allowed branch - env: - SHALLOW_SINCE: ${{ inputs.shallow_since }} - run: | - FETCH_DEPTH_ARGS=() - if [ -n "$SHALLOW_SINCE" ]; then - FETCH_DEPTH_ARGS=(--shallow-since="$SHALLOW_SINCE") - elif [ -f .git/shallow ]; then - git fetch origin --tags --unshallow - fi - - git fetch origin --tags "${FETCH_DEPTH_ARGS[@]}" - git fetch origin --no-tags main "${FETCH_DEPTH_ARGS[@]}" - git fetch origin \ - --no-tags \ - "refs/heads/beta*:refs/remotes/origin/beta*" \ - "refs/heads/alpha*:refs/remotes/origin/alpha*" \ - "refs/heads/canary*:refs/remotes/origin/canary*" \ - "refs/heads/dev*:refs/remotes/origin/dev*" \ - "${FETCH_DEPTH_ARGS[@]}" || true - TAG_COMMIT=$(git rev-list -n 1 "${{ steps.tag.outputs.tag }}") - if git merge-base --is-ancestor "$TAG_COMMIT" "origin/main"; then - echo "Tag commit is on main." 
- exit 0 - fi - - MATCHING_BRANCH=$(git for-each-ref --format='%(refname:short)' refs/remotes/origin \ - | grep -Ei 'origin/.*(beta|alpha|canary|dev)' \ - | while read -r branch; do - if git merge-base --is-ancestor "$TAG_COMMIT" "$branch"; then - echo "$branch" - break - fi - done) - - note() { echo "$1"; } - if [ -n "$MATCHING_BRANCH" ]; then - note "Tag commit is on allowed branch: $MATCHING_BRANCH" - else - note "Tag commit is not on main or an allowed branch (beta/alpha/canary/dev)." >&2 - exit 1 - fi - - - name: Check pre-release status - id: prerelease - env: - TAG_NAME: ${{ steps.tag.outputs.tag }} - run: | - if [[ "$TAG_NAME" == *-alpha* || "$TAG_NAME" == *-beta* || "$TAG_NAME" == *-rc* || "$TAG_NAME" == *-canary* ]]; then - echo "IS_PRERELEASE=true" >> "$GITHUB_OUTPUT" - else - echo "IS_PRERELEASE=false" >> "$GITHUB_OUTPUT" - fi - - - name: Block unsupported pre-release channels - env: - TAG_NAME: ${{ steps.tag.outputs.tag }} - run: | - if [[ "$TAG_NAME" == *-alpha* || "$TAG_NAME" == *-beta* ]]; then - echo "Alpha and beta tags are intentionally excluded from npm publishing in this workflow: '$TAG_NAME'." >&2 - exit 1 - fi - - - name: Setup Bun - uses: oven-sh/setup-bun@v2 - with: - bun-version: latest - - - name: Setup Node - uses: actions/setup-node@v6 - with: - node-version: 22.14.0 - registry-url: https://registry.npmjs.org - - - name: Update npm for trusted publishing - run: npm install -g npm@latest - - - name: Install dependencies - run: bun install --frozen-lockfile - - - name: Ensure tag matches package version - env: - TAG_NAME: ${{ steps.tag.outputs.tag }} - run: | - PKG_VERSION=$(node -p "require('./package.json').version") - if [[ "$TAG_NAME" != "$PKG_VERSION" && "$TAG_NAME" != "v$PKG_VERSION" ]]; then - echo "Tag '$TAG_NAME' does not match package version '$PKG_VERSION'." >&2 - exit 1 - fi - - - name: Build - run: bun run build - - - name: Publish - env: - # Clear inherited tokens so npm uses GitHub OIDC trusted publishing. 
-          NODE_AUTH_TOKEN: ""
-        run: |
-          if [[ "${{ steps.prerelease.outputs.IS_PRERELEASE }}" == "true" ]]; then
-            npm publish --registry https://registry.npmjs.org --access public --tag next
-          else
-            npm publish --registry https://registry.npmjs.org --access public
-          fi
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 8a021f0..3ca5431 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -24,6 +24,8 @@ jobs:
     outputs:
       tag: ${{ steps.tag.outputs.tag }}
       prerelease: ${{ steps.prerelease.outputs.IS_PRERELEASE }}
+      prerelease_label: ${{ steps.prerelease.outputs.PRERELEASE_LABEL }}
+      npm_publish_enabled: ${{ steps.prerelease.outputs.NPM_PUBLISH_ENABLED }}
       release_notes: ${{ steps.notes.outputs.content }}
       previous_tag: ${{ steps.range.outputs.previous_tag }}
       release_range: ${{ steps.range.outputs.range }}
@@ -115,6 +117,18 @@ jobs:
             echo "IS_PRERELEASE=false" >> "$GITHUB_OUTPUT"
           fi
 
+          # alpha and beta tags are kept off npm; publish_npm skips its publish
+          # command when this output is not "true".
+          # NOTE(review): PRERELEASE_LABEL is assigned outside this hunk — confirm it
+          # is set earlier in this step; if it is unset, npm publishing is always enabled.
+          if [[ "$PRERELEASE_LABEL" == "alpha" || "$PRERELEASE_LABEL" == "beta" ]]; then
+            echo "NPM_PUBLISH_ENABLED=false" >> "$GITHUB_OUTPUT"
+          else
+            echo "NPM_PUBLISH_ENABLED=true" >> "$GITHUB_OUTPUT"
+          fi
+
+          echo "PRERELEASE_LABEL=$PRERELEASE_LABEL" >> "$GITHUB_OUTPUT"
+
       - name: Resolve release range
         id: range
         env:
@@ -207,9 +221,179 @@ jobs:
             echo "__RELEASE_NOTES__"
           } >> "$GITHUB_OUTPUT"
 
-  release:
+  # Build the package once from the tagged ref and upload it as an artifact;
+  # both publish jobs download that artifact instead of rebuilding.
+  prepare:
     runs-on: ubuntu-latest
     needs: notes
+    permissions:
+      contents: read
+    steps:
+      - name: Checkout release ref
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 1
+          ref: ${{ format('refs/tags/{0}', needs.notes.outputs.tag) }}
+          show-progress: false
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: latest
+
+      - name: Setup Node
+        uses: actions/setup-node@v6
+        with:
+          node-version: 20
+
+      - name: Setup Rust toolchain
+        uses: dtolnay/rust-toolchain@stable
+        with:
+          targets: wasm32-unknown-unknown
+
+      - name: Cache Rust artifacts
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: crates/language-detector
+
+      - name: Install wasm-pack
+        run: cargo install wasm-pack --locked
+
+      - name: Install dependencies
+        run: bun install --frozen-lockfile
+
+      - name: Build
+        run: bun run build
+
+      - name: Verify package contents
+        run: bun run verify:package-contents
+
+      - name: Upload release package artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: release-package-${{ needs.notes.outputs.tag }}
+          if-no-files-found: error
+          path: |
+            dist
+            package.json
+            README.md
+            LICENSE*
+
+  # Publish the prepared artifact to the npm registry. No token is configured:
+  # id-token: write plus the empty NODE_AUTH_TOKEN leave npm to authenticate
+  # through GitHub OIDC trusted publishing.
+  publish_npm:
+    runs-on: ubuntu-latest
+    needs:
+      - notes
+      - prepare
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - name: Download prepared package artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: release-package-${{ needs.notes.outputs.tag }}
+          path: release-package
+
+      - name: Setup Node
+        uses: actions/setup-node@v6
+        with:
+          node-version: 22.14.0
+          registry-url: https://registry.npmjs.org
+
+      # Trusted publishing needs npm 11.5.1 or newer. Stay on npm 11 instead of
+      # `latest` so a future npm major cannot drop support for the pinned Node
+      # version above and break releases.
+      - name: Update npm for trusted publishing
+        run: npm install -g "npm@^11.5.1"
+
+      - name: Ensure tag matches package version
+        working-directory: release-package
+        env:
+          TAG_NAME: ${{ needs.notes.outputs.tag }}
+        run: |
+          PKG_VERSION=$(node -p "require('./package.json').version")
+          if [[ "$TAG_NAME" != "$PKG_VERSION" && "$TAG_NAME" != "v$PKG_VERSION" ]]; then
+            echo "Tag '$TAG_NAME' does not match package version '$PKG_VERSION'." >&2
+            exit 1
+          fi
+
+      - name: Publish to npm
+        working-directory: release-package
+        env:
+          TAG_NAME: ${{ needs.notes.outputs.tag }}
+          IS_PRERELEASE: ${{ needs.notes.outputs.prerelease }}
+          NPM_PUBLISH_ENABLED: ${{ needs.notes.outputs.npm_publish_enabled }}
+          NODE_AUTH_TOKEN: ""
+        run: |
+          # Not a failure: the release job requires this job to finish with
+          # "success", so exit 0 when the channel is excluded from npm.
+          if [[ "$NPM_PUBLISH_ENABLED" != "true" ]]; then
+            echo "Skipping npm publish for unsupported prerelease channel: '$TAG_NAME'."
+            exit 0
+          fi
+
+          if [[ "$IS_PRERELEASE" == "true" ]]; then
+            npm publish --registry https://registry.npmjs.org --access public --tag next
+          else
+            npm publish --registry https://registry.npmjs.org --access public
+          fi
+
+  # Publish the same prepared artifact to GitHub Packages under the repository owner's scope.
+  publish_github_packages:
+    runs-on: ubuntu-latest
+    needs:
+      - notes
+      - prepare
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Download prepared package artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: release-package-${{ needs.notes.outputs.tag }}
+          path: release-package
+
+      - name: Setup Node
+        uses: actions/setup-node@v6
+        with:
+          node-version: 20
+          registry-url: https://npm.pkg.github.com
+          scope: "@${{ github.repository_owner }}"
+
+      - name: Set package name for GitHub Packages
+        working-directory: release-package
+        run: npm pkg set name=@${{ github.repository_owner }}/word-counter
+
+      - name: Publish to GitHub Packages
+        working-directory: release-package
+        env:
+          IS_PRERELEASE: ${{ needs.notes.outputs.prerelease }}
+          PRERELEASE_LABEL: ${{ needs.notes.outputs.prerelease_label }}
+          NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          if [[ "$IS_PRERELEASE" == "true" ]]; then
+            npm publish --registry https://npm.pkg.github.com --access restricted --tag next
+            PKG_NAME=$(node -p "require('./package.json').name")
+            PKG_VERSION=$(node -p "require('./package.json').version")
+            # Also expose this version under its channel name (the prerelease_label output).
+            npm dist-tag add "$PKG_NAME@$PKG_VERSION" "$PRERELEASE_LABEL" --registry https://npm.pkg.github.com
+          else
+            npm publish --registry https://npm.pkg.github.com --access restricted
+          fi
+
+  # always() replaces the implicit success() gate, so the explicit checks decide:
+  # run only when notes and both publish jobs finished with "success".
+  release:
+    runs-on: ubuntu-latest
+    needs:
+      - notes
+      - publish_npm
+      - publish_github_packages
+    if: ${{ always() && needs.notes.result == 'success' && needs.publish_npm.result == 'success' && needs.publish_github_packages.result == 'success' }}
     permissions:
       contents: write
     steps:
diff --git a/.gitignore b/.gitignore
index a14702c..f7be9e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ node_modules
 out
 dist
 *.tgz
+generated/
 
 # code
coverage coverage @@ -27,6 +28,10 @@ report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json .cache *.tsbuildinfo +# Rust / Cargo +target/ +Cargo.lock + # IntelliJ based IDEs .idea diff --git a/README.md b/README.md index bf0ba7d..3c72cec 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,8 @@ For local development in this repository: ```bash git clone https://github.com/dev-pi2pie/word-counter.git cd word-counter +rustup target add wasm32-unknown-unknown +cargo install wasm-pack --locked bun install bun run build npm link @@ -94,6 +96,22 @@ word-counter --han-language zh-Hant "漢字測試" word-counter --han-tag zh-Hans "汉字测试" ``` +Enable the optional WASM detector for ambiguous Latin and Han routes: + +```bash +word-counter --detector wasm "This sentence should clearly be detected as English for the wasm detector path." +word-counter --detector wasm "漢字測試需要更多內容才能觸發偵測" +``` + +Detector mode notes: + +- `--detector regex` is the default behavior. +- `--detector wasm` only runs for ambiguous `und-Latn` and `und-Hani` chunks. +- `--detector regex` keeps the original script/regex chunk-first detection path. +- `--detector wasm` uses a detector-oriented ambiguous-window scoring pass before accepted tags are projected back onto the counting chunks. +- Very short chunks stay on the original `und-*` fallback. +- Low-confidence or unsupported detector results fall back to `und-*`. + Collect non-words (emoji/symbols/punctuation): ```bash @@ -293,6 +311,7 @@ Skip details stay debug-gated and can be suppressed with `--quiet-skips`. - Adjacent characters that share the same locale tag are grouped into a chunk. - Each chunk is counted with `Intl.Segmenter` at `granularity: "word"`, caching segmenters to avoid re-instantiation. - Per-locale counts are summed into an overall total and printed to stdout. +- With `--detector wasm`, ambiguous `und-Latn` and `und-Hani` chunks can be relabeled through the optional WASM detector before counting. 
## Locale vs Language Code @@ -316,6 +335,10 @@ import wordCounter, { segmentTextByLocale, showSingularOrPluralWord, } from "@dev-pi2pie/word-counter"; +import { + wordCounterWithDetector, + segmentTextByLocaleWithDetector, +} from "@dev-pi2pie/word-counter/detector"; wordCounter("Hello world", { latinLanguageHint: "en" }); wordCounter("Hello world", { latinTagHint: "en" }); @@ -329,6 +352,11 @@ wordCounter("Hi 👋, world!", { mode: "char", nonWords: true }); wordCounter("飛鳥 bird 貓 cat", { mode: "char-collector" }); wordCounter("Hi\tthere\n", { nonWords: true, includeWhitespace: true }); countCharsForLocale("👋", "en"); +await wordCounterWithDetector( + "This sentence should clearly be detected as English for the wasm detector path.", + { detector: "wasm" }, +); +await segmentTextByLocaleWithDetector("Hello 世界", { detector: "regex" }); ``` Note: `includeWhitespace` only affects results when `nonWords: true` is enabled. @@ -362,6 +390,7 @@ Sample output (with `nonWords: true` and `includeWhitespace: true`): ```js const wordCounter = require("@dev-pi2pie/word-counter"); +const detector = require("@dev-pi2pie/word-counter/detector"); const { countCharsForLocale, countWordsForLocale, @@ -383,6 +412,10 @@ wordCounter("Hi 👋, world!", { mode: "char", nonWords: true }); wordCounter("飛鳥 bird 貓 cat", { mode: "char-collector" }); wordCounter("Hi\tthere\n", { nonWords: true, includeWhitespace: true }); countCharsForLocale("👋", "en"); +detector.wordCounterWithDetector( + "This sentence should clearly be detected as English for the wasm detector path.", + { detector: "wasm" }, +).then((result) => console.log(result)); ``` Note: `includeWhitespace` only affects results when `nonWords: true` is enabled. @@ -437,6 +470,18 @@ Sample output (with `nonWords: true` and `includeWhitespace: true`): | -------------------------- | -------- | ------------------------------ | | `showSingularOrPluralWord` | function | Formats singular/plural words. 
| +#### Detector Subpath + +Import from `@dev-pi2pie/word-counter/detector` for the explicit detector-enabled API. + +| Export | Kind | Notes | +| ----------------------------- | -------- | ----------------------------------------------- | +| `wordCounterWithDetector` | function | Async detector-aware counting entrypoint. | +| `segmentTextByLocaleWithDetector` | function | Async detector-aware locale segmentation. | +| `countSectionsWithDetector` | function | Async detector-aware section counting. | +| `DEFAULT_DETECTOR_MODE` | value | Current default detector mode (`regex`). | +| `DETECTOR_MODES` | value | Supported detector modes. | + #### Types | Export | Kind | Notes | @@ -650,6 +695,9 @@ Example JSON (trimmed): - Detection is regex/script based, not statistical language-ID. - Ambiguous Latin defaults to `und-Latn`; Han fallback defaults to `und-Hani`. +- `--detector wasm` is optional and conservative; it only runs for ambiguous chunks that meet minimum script-bearing length thresholds. +- The current first WASM engine is `whatlang`, remapped into this package's public tags. +- The npm package ships one portable WASM artifact; users do not install per-OS detector packages. - Use explicit tag and hint flags when you need deterministic tagging. - Full notes (built-in heuristics, limitations, and override guidance) are tracked in `docs/locale-tag-detection-notes.md`. 
diff --git a/bun.lock b/bun.lock index d984547..e59307a 100644 --- a/bun.lock +++ b/bun.lock @@ -6,14 +6,14 @@ "name": "word-counter", "dependencies": { "commander": "^14.0.3", - "yaml": "^2.8.2", + "yaml": "^2.8.3", }, "devDependencies": { "@types/node": "^25.5.0", - "oxfmt": "^0.40.0", - "oxlint": "^1.55.0", + "oxfmt": "^0.41.0", + "oxlint": "^1.56.0", "picocolors": "^1.1.1", - "tsdown": "^0.21.2", + "tsdown": "^0.21.4", "typescript": "^5.9.3", }, "peerDependencies": { @@ -50,81 +50,81 @@ "@oxc-project/types": ["@oxc-project/types@0.115.0", "", {}, "sha512-4n91DKnebUS4yjUHl2g3/b2T+IUdCfmoZGhmwsovZCDaJSs+QkVAM+0AqqTxHSsHfeiMuueT75cZaZcT/m0pSw=="], - "@oxfmt/binding-android-arm-eabi": ["@oxfmt/binding-android-arm-eabi@0.40.0", "", { "os": "android", "cpu": "arm" }, "sha512-S6zd5r1w/HmqR8t0CTnGjFTBLDq2QKORPwriCHxo4xFNuhmOTABGjPaNvCJJVnrKBLsohOeiDX3YqQfJPF+FXw=="], + "@oxfmt/binding-android-arm-eabi": ["@oxfmt/binding-android-arm-eabi@0.41.0", "", { "os": "android", "cpu": "arm" }, "sha512-REfrqeMKGkfMP+m/ScX4f5jJBSmVNYcpoDF8vP8f8eYPDuPGZmzp56NIUsYmx3h7f6NzC6cE3gqh8GDWrJHCKw=="], - "@oxfmt/binding-android-arm64": ["@oxfmt/binding-android-arm64@0.40.0", "", { "os": "android", "cpu": "arm64" }, "sha512-/mbS9UUP/5Vbl2D6osIdcYiP0oie63LKMoTyGj5hyMCK/SFkl3EhtyRAfdjPvuvHC0SXdW6ePaTKkBSq1SNcIw=="], + "@oxfmt/binding-android-arm64": ["@oxfmt/binding-android-arm64@0.41.0", "", { "os": "android", "cpu": "arm64" }, "sha512-s0b1dxNgb2KomspFV2LfogC2XtSJB42POXF4bMCLJyvQmAGos4ZtjGPfQreToQEaY0FQFjz3030ggI36rF1q5g=="], - "@oxfmt/binding-darwin-arm64": ["@oxfmt/binding-darwin-arm64@0.40.0", "", { "os": "darwin", "cpu": "arm64" }, "sha512-wRt8fRdfLiEhnRMBonlIbKrJWixoEmn6KCjKE9PElnrSDSXETGZfPb8ee+nQNTobXkCVvVLytp2o0obAsxl78Q=="], + "@oxfmt/binding-darwin-arm64": ["@oxfmt/binding-darwin-arm64@0.41.0", "", { "os": "darwin", "cpu": "arm64" }, "sha512-EGXGualADbv/ZmamE7/2DbsrYmjoPlAmHEpTL4vapLF4EfVD6fr8/uQDFnPJkUBjiSWFJZtFNsGeN1B6V3owmA=="], - "@oxfmt/binding-darwin-x64": 
["@oxfmt/binding-darwin-x64@0.40.0", "", { "os": "darwin", "cpu": "x64" }, "sha512-fzowhqbOE/NRy+AE5ob0+Y4X243WbWzDb00W+pKwD7d9tOqsAFbtWUwIyqqCoCLxj791m2xXIEeLH/3uz7zCCg=="], + "@oxfmt/binding-darwin-x64": ["@oxfmt/binding-darwin-x64@0.41.0", "", { "os": "darwin", "cpu": "x64" }, "sha512-WxySJEvdQQYMmyvISH3qDpTvoS0ebnIP63IMxLLWowJyPp/AAH0hdWtlo+iGNK5y3eVfa5jZguwNaQkDKWpGSw=="], - "@oxfmt/binding-freebsd-x64": ["@oxfmt/binding-freebsd-x64@0.40.0", "", { "os": "freebsd", "cpu": "x64" }, "sha512-agZ9ITaqdBjcerRRFEHB8s0OyVcQW8F9ZxsszjxzeSthQ4fcN2MuOtQFWec1ed8/lDa50jSLHVE2/xPmTgtCfQ=="], + "@oxfmt/binding-freebsd-x64": ["@oxfmt/binding-freebsd-x64@0.41.0", "", { "os": "freebsd", "cpu": "x64" }, "sha512-Y2kzMkv3U3oyuYaR4wTfGjOTYTXiFC/hXmG0yVASKkbh02BJkvD98Ij8bIevr45hNZ0DmZEgqiXF+9buD4yMYQ=="], - "@oxfmt/binding-linux-arm-gnueabihf": ["@oxfmt/binding-linux-arm-gnueabihf@0.40.0", "", { "os": "linux", "cpu": "arm" }, "sha512-ZM2oQ47p28TP1DVIp7HL1QoMUgqlBFHey0ksHct7tMXoU5BqjNvPWw7888azzMt25lnyPODVuye1wvNbvVUFOA=="], + "@oxfmt/binding-linux-arm-gnueabihf": ["@oxfmt/binding-linux-arm-gnueabihf@0.41.0", "", { "os": "linux", "cpu": "arm" }, "sha512-ptazDjdUyhket01IjPTT6ULS1KFuBfTUU97osTP96X5y/0oso+AgAaJzuH81oP0+XXyrWIHbRzozSAuQm4p48g=="], - "@oxfmt/binding-linux-arm-musleabihf": ["@oxfmt/binding-linux-arm-musleabihf@0.40.0", "", { "os": "linux", "cpu": "arm" }, "sha512-RBFPAxRAIsMisKM47Oe6Lwdv6agZYLz02CUhVCD1sOv5ajAcRMrnwCFBPWwGXpazToW2mjnZxFos8TuFjTU15A=="], + "@oxfmt/binding-linux-arm-musleabihf": ["@oxfmt/binding-linux-arm-musleabihf@0.41.0", "", { "os": "linux", "cpu": "arm" }, "sha512-UkoL2OKxFD+56bPEBcdGn+4juTW4HRv/T6w1dIDLnvKKWr6DbarB/mtHXlADKlFiJubJz8pRkttOR7qjYR6lTA=="], - "@oxfmt/binding-linux-arm64-gnu": ["@oxfmt/binding-linux-arm64-gnu@0.40.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-Nb2XbQ+wV3W2jSIihXdPj7k83eOxeSgYP3N/SRXvQ6ZYPIk6Q86qEh5Gl/7OitX3bQoQrESqm1yMLvZV8/J7dA=="], + "@oxfmt/binding-linux-arm64-gnu": ["@oxfmt/binding-linux-arm64-gnu@0.41.0", "", { 
"os": "linux", "cpu": "arm64" }, "sha512-gofu0PuumSOHYczD8p62CPY4UF6ee+rSLZJdUXkpwxg6pILiwSDBIouPskjF/5nF3A7QZTz2O9KFNkNxxFN9tA=="], - "@oxfmt/binding-linux-arm64-musl": ["@oxfmt/binding-linux-arm64-musl@0.40.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-tGmWhLD/0YMotCdfezlT6tC/MJG/wKpo4vnQ3Cq+4eBk/BwNv7EmkD0VkD5F/dYkT3b8FNU01X2e8vvJuWoM1w=="], + "@oxfmt/binding-linux-arm64-musl": ["@oxfmt/binding-linux-arm64-musl@0.41.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-VfVZxL0+6RU86T8F8vKiDBa+iHsr8PAjQmKGBzSCAX70b6x+UOMFl+2dNihmKmUwqkCazCPfYjt6SuAPOeQJ3g=="], - "@oxfmt/binding-linux-ppc64-gnu": ["@oxfmt/binding-linux-ppc64-gnu@0.40.0", "", { "os": "linux", "cpu": "ppc64" }, "sha512-rVbFyM3e7YhkVnp0IVYjaSHfrBWcTRWb60LEcdNAJcE2mbhTpbqKufx0FrhWfoxOrW/+7UJonAOShoFFLigDqQ=="], + "@oxfmt/binding-linux-ppc64-gnu": ["@oxfmt/binding-linux-ppc64-gnu@0.41.0", "", { "os": "linux", "cpu": "ppc64" }, "sha512-bwzokz2eGvdfJbc0i+zXMJ4BBjQPqg13jyWpEEZDOrBCQ91r8KeY2Mi2kUeuMTZNFXju+jcAbAbpyJxRGla0eg=="], - "@oxfmt/binding-linux-riscv64-gnu": ["@oxfmt/binding-linux-riscv64-gnu@0.40.0", "", { "os": "linux", "cpu": "none" }, "sha512-3ZqBw14JtWeEoLiioJcXSJz8RQyPE+3jLARnYM1HdPzZG4vk+Ua8CUupt2+d+vSAvMyaQBTN2dZK+kbBS/j5mA=="], + "@oxfmt/binding-linux-riscv64-gnu": ["@oxfmt/binding-linux-riscv64-gnu@0.41.0", "", { "os": "linux", "cpu": "none" }, "sha512-POLM//PCH9uqDeNDwWL3b3DkMmI3oI2cU6hwc2lnztD1o7dzrQs3R9nq555BZ6wI7t2lyhT9CS+CRaz5X0XqLA=="], - "@oxfmt/binding-linux-riscv64-musl": ["@oxfmt/binding-linux-riscv64-musl@0.40.0", "", { "os": "linux", "cpu": "none" }, "sha512-JJ4PPSdcbGBjPvb+O7xYm2FmAsKCyuEMYhqatBAHMp/6TA6rVlf9Z/sYPa4/3Bommb+8nndm15SPFRHEPU5qFA=="], + "@oxfmt/binding-linux-riscv64-musl": ["@oxfmt/binding-linux-riscv64-musl@0.41.0", "", { "os": "linux", "cpu": "none" }, "sha512-NNK7PzhFqLUwx/G12Xtm6scGv7UITvyGdAR5Y+TlqsG+essnuRWR4jRNODWRjzLZod0T3SayRbnkSIWMBov33w=="], - "@oxfmt/binding-linux-s390x-gnu": ["@oxfmt/binding-linux-s390x-gnu@0.40.0", "", { "os": "linux", "cpu": 
"s390x" }, "sha512-Kp0zNJoX9Ik77wUya2tpBY3W9f40VUoMQLWVaob5SgCrblH/t2xr/9B2bWHfs0WCefuGmqXcB+t0Lq77sbBmZw=="], + "@oxfmt/binding-linux-s390x-gnu": ["@oxfmt/binding-linux-s390x-gnu@0.41.0", "", { "os": "linux", "cpu": "s390x" }, "sha512-qVf/zDC5cN9eKe4qI/O/m445er1IRl6swsSl7jHkqmOSVfknwCe5JXitYjZca+V/cNJSU/xPlC5EFMabMMFDpw=="], - "@oxfmt/binding-linux-x64-gnu": ["@oxfmt/binding-linux-x64-gnu@0.40.0", "", { "os": "linux", "cpu": "x64" }, "sha512-7YTCNzleWTaQTqNGUNQ66qVjpoV6DjbCOea+RnpMBly2bpzrI/uu7Rr+2zcgRfNxyjXaFTVQKaRKjqVdeUfeVA=="], + "@oxfmt/binding-linux-x64-gnu": ["@oxfmt/binding-linux-x64-gnu@0.41.0", "", { "os": "linux", "cpu": "x64" }, "sha512-ojxYWu7vUb6ysYqVCPHuAPVZHAI40gfZ0PDtZAMwVmh2f0V8ExpPIKoAKr7/8sNbAXJBBpZhs2coypIo2jJX4w=="], - "@oxfmt/binding-linux-x64-musl": ["@oxfmt/binding-linux-x64-musl@0.40.0", "", { "os": "linux", "cpu": "x64" }, "sha512-hWnSzJ0oegeOwfOEeejYXfBqmnRGHusgtHfCPzmvJvHTwy1s3Neo59UKc1CmpE3zxvrCzJoVHos0rr97GHMNPw=="], + "@oxfmt/binding-linux-x64-musl": ["@oxfmt/binding-linux-x64-musl@0.41.0", "", { "os": "linux", "cpu": "x64" }, "sha512-O2exZLBxoCMIv2vlvcbkdedazJPTdG0VSup+0QUCfYQtx751zCZNboX2ZUOiQ/gDTdhtXvSiot0h6GEGkOyalA=="], - "@oxfmt/binding-openharmony-arm64": ["@oxfmt/binding-openharmony-arm64@0.40.0", "", { "os": "none", "cpu": "arm64" }, "sha512-28sJC1lR4qtBJGzSRRbPnSW3GxU2+4YyQFE6rCmsUYqZ5XYH8jg0/w+CvEzQ8TuAQz5zLkcA25nFQGwoU0PT3Q=="], + "@oxfmt/binding-openharmony-arm64": ["@oxfmt/binding-openharmony-arm64@0.41.0", "", { "os": "none", "cpu": "arm64" }, "sha512-N+31/VoL+z+NNBt8viy3I4NaIdPbiYeOnB884LKqvXldaE2dRztdPv3q5ipfZYv0RwFp7JfqS4I27K/DSHCakg=="], - "@oxfmt/binding-win32-arm64-msvc": ["@oxfmt/binding-win32-arm64-msvc@0.40.0", "", { "os": "win32", "cpu": "arm64" }, "sha512-cDkRnyT0dqwF5oIX1Cv59HKCeZQFbWWdUpXa3uvnHFT2iwYSSZspkhgjXjU6iDp5pFPaAEAe9FIbMoTgkTmKPg=="], + "@oxfmt/binding-win32-arm64-msvc": ["@oxfmt/binding-win32-arm64-msvc@0.41.0", "", { "os": "win32", "cpu": "arm64" }, 
"sha512-Z7NAtu/RN8kjCQ1y5oDD0nTAeRswh3GJ93qwcW51srmidP7XPBmZbLlwERu1W5veCevQJtPS9xmkpcDTYsGIwQ=="], - "@oxfmt/binding-win32-ia32-msvc": ["@oxfmt/binding-win32-ia32-msvc@0.40.0", "", { "os": "win32", "cpu": "ia32" }, "sha512-7rPemBJjqm5Gkv6ZRCPvK8lE6AqQ/2z31DRdWazyx2ZvaSgL7QGofHXHNouRpPvNsT9yxRNQJgigsWkc+0qg4w=="], + "@oxfmt/binding-win32-ia32-msvc": ["@oxfmt/binding-win32-ia32-msvc@0.41.0", "", { "os": "win32", "cpu": "ia32" }, "sha512-uNxxP3l4bJ6VyzIeRqCmBU2Q0SkCFgIhvx9/9dJ9V8t/v+jP1IBsuaLwCXGR8JPHtkj4tFp+RHtUmU2ZYAUpMA=="], - "@oxfmt/binding-win32-x64-msvc": ["@oxfmt/binding-win32-x64-msvc@0.40.0", "", { "os": "win32", "cpu": "x64" }, "sha512-/Zmj0yTYSvmha6TG1QnoLqVT7ZMRDqXvFXXBQpIjteEwx9qvUYMBH2xbiOFhDeMUJkGwC3D6fdKsFtaqUvkwNA=="], + "@oxfmt/binding-win32-x64-msvc": ["@oxfmt/binding-win32-x64-msvc@0.41.0", "", { "os": "win32", "cpu": "x64" }, "sha512-49ZSpbZ1noozyPapE8SUOSm3IN0Ze4b5nkO+4+7fq6oEYQQJFhE0saj5k/Gg4oewVPdjn0L3ZFeWk2Vehjcw7A=="], - "@oxlint/binding-android-arm-eabi": ["@oxlint/binding-android-arm-eabi@1.55.0", "", { "os": "android", "cpu": "arm" }, "sha512-NhvgAhncTSOhRahQSCnkK/4YIGPjTmhPurQQ2dwt2IvwCMTvZRW5vF2K10UBOxFve4GZDMw6LtXZdC2qeuYIVQ=="], + "@oxlint/binding-android-arm-eabi": ["@oxlint/binding-android-arm-eabi@1.56.0", "", { "os": "android", "cpu": "arm" }, "sha512-IyfYPthZyiSKwAv/dLjeO18SaK8MxLI9Yss2JrRDyweQAkuL3LhEy7pwIwI7uA3KQc1Vdn20kdmj3q0oUIQL6A=="], - "@oxlint/binding-android-arm64": ["@oxlint/binding-android-arm64@1.55.0", "", { "os": "android", "cpu": "arm64" }, "sha512-P9iWRh+Ugqhg+D7rkc7boHX8o3H2h7YPcZHQIgvVBgnua5tk4LR2L+IBlreZs58/95cd2x3/004p5VsQM9z4SA=="], + "@oxlint/binding-android-arm64": ["@oxlint/binding-android-arm64@1.56.0", "", { "os": "android", "cpu": "arm64" }, "sha512-Ga5zYrzH6vc/VFxhn6MmyUnYEfy9vRpwTIks99mY3j6Nz30yYpIkWryI0QKPCgvGUtDSXVLEaMum5nA+WrNOSg=="], - "@oxlint/binding-darwin-arm64": ["@oxlint/binding-darwin-arm64@1.55.0", "", { "os": "darwin", "cpu": "arm64" }, 
"sha512-esakkJIt7WFAhT30P/Qzn96ehFpzdZ1mNuzpOb8SCW7lI4oB8VsyQnkSHREM671jfpuBb/o2ppzBCx5l0jpgMA=="], + "@oxlint/binding-darwin-arm64": ["@oxlint/binding-darwin-arm64@1.56.0", "", { "os": "darwin", "cpu": "arm64" }, "sha512-ogmbdJysnw/D4bDcpf1sPLpFThZ48lYp4aKYm10Z/6Nh1SON6NtnNhTNOlhEY296tDFItsZUz+2tgcSYqh8Eyw=="], - "@oxlint/binding-darwin-x64": ["@oxlint/binding-darwin-x64@1.55.0", "", { "os": "darwin", "cpu": "x64" }, "sha512-xDMFRCCAEK9fOH6As2z8ELsC+VDGSFRHwIKVSilw+xhgLwTDFu37rtmRbmUlx8rRGS6cWKQPTc47AVxAZEVVPQ=="], + "@oxlint/binding-darwin-x64": ["@oxlint/binding-darwin-x64@1.56.0", "", { "os": "darwin", "cpu": "x64" }, "sha512-x8QE1h+RAtQ2g+3KPsP6Fk/tdz6zJQUv5c7fTrJxXV3GHOo+Ry5p/PsogU4U+iUZg0rj6hS+E4xi+mnwwlDCWQ=="], - "@oxlint/binding-freebsd-x64": ["@oxlint/binding-freebsd-x64@1.55.0", "", { "os": "freebsd", "cpu": "x64" }, "sha512-mYZqnwUD7ALCRxGenyLd1uuG+rHCL+OTT6S8FcAbVm/ZT2AZMGjvibp3F6k1SKOb2aeqFATmwRykrE41Q0GWVw=="], + "@oxlint/binding-freebsd-x64": ["@oxlint/binding-freebsd-x64@1.56.0", "", { "os": "freebsd", "cpu": "x64" }, "sha512-6G+WMZvwJpMvY7my+/SHEjb7BTk/PFbePqLpmVmUJRIsJMy/UlyYqjpuh0RCgYYkPLcnXm1rUM04kbTk8yS1Yg=="], - "@oxlint/binding-linux-arm-gnueabihf": ["@oxlint/binding-linux-arm-gnueabihf@1.55.0", "", { "os": "linux", "cpu": "arm" }, "sha512-LcX6RYcF9vL9ESGwJW3yyIZ/d/ouzdOKXxCdey1q0XJOW1asrHsIg5MmyKdEBR4plQx+shvYeQne7AzW5f3T1w=="], + "@oxlint/binding-linux-arm-gnueabihf": ["@oxlint/binding-linux-arm-gnueabihf@1.56.0", "", { "os": "linux", "cpu": "arm" }, "sha512-YYHBsk/sl7fYwQOok+6W5lBPeUEvisznV/HZD2IfZmF3Bns6cPC3Z0vCtSEOaAWTjYWN3jVsdu55jMxKlsdlhg=="], - "@oxlint/binding-linux-arm-musleabihf": ["@oxlint/binding-linux-arm-musleabihf@1.55.0", "", { "os": "linux", "cpu": "arm" }, "sha512-C+8GS1rPtK+dI7mJFkqoRBkDuqbrNihnyYQsJPS9ez+8zF9JzfvU19lawqt4l/Y23o5uQswE/DORa8aiXUih3w=="], + "@oxlint/binding-linux-arm-musleabihf": ["@oxlint/binding-linux-arm-musleabihf@1.56.0", "", { "os": "linux", "cpu": "arm" }, 
"sha512-+AZK8rOUr78y8WT6XkDb04IbMRqauNV+vgT6f8ZLOH8wnpQ9i7Nol0XLxAu+Cq7Sb+J9wC0j6Km5hG8rj47/yQ=="], - "@oxlint/binding-linux-arm64-gnu": ["@oxlint/binding-linux-arm64-gnu@1.55.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-ErLE4XbmcCopA4/CIDiH6J1IAaDOMnf/KSx/aFObs4/OjAAM3sFKWGZ57pNOMxhhyBdcmcXwYymph9GwcpcqgQ=="], + "@oxlint/binding-linux-arm64-gnu": ["@oxlint/binding-linux-arm64-gnu@1.56.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-urse2SnugwJRojUkGSSeH2LPMaje5Q50yQtvtL9HFckiyeqXzoFwOAZqD5TR29R2lq7UHidfFDM9EGcchcbb8A=="], - "@oxlint/binding-linux-arm64-musl": ["@oxlint/binding-linux-arm64-musl@1.55.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-/kp65avi6zZfqEng56TTuhiy3P/3pgklKIdf38yvYeJ9/PgEeRA2A2AqKAKbZBNAqUzrzHhz9jF6j/PZvhJzTQ=="], + "@oxlint/binding-linux-arm64-musl": ["@oxlint/binding-linux-arm64-musl@1.56.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-rkTZkBfJ4TYLjansjSzL6mgZOdN5IvUnSq3oNJSLwBcNvy3dlgQtpHPrRxrCEbbcp7oQ6If0tkNaqfOsphYZ9g=="], - "@oxlint/binding-linux-ppc64-gnu": ["@oxlint/binding-linux-ppc64-gnu@1.55.0", "", { "os": "linux", "cpu": "ppc64" }, "sha512-A6pTdXwcEEwL/nmz0eUJ6WxmxcoIS+97GbH96gikAyre3s5deC7sts38ZVVowjS2QQFuSWkpA4ZmQC0jZSNvJQ=="], + "@oxlint/binding-linux-ppc64-gnu": ["@oxlint/binding-linux-ppc64-gnu@1.56.0", "", { "os": "linux", "cpu": "ppc64" }, "sha512-uqL1kMH3u69/e1CH2EJhP3CP28jw2ExLsku4o8RVAZ7fySo9zOyI2fy9pVlTAp4voBLVgzndXi3SgtdyCTa2aA=="], - "@oxlint/binding-linux-riscv64-gnu": ["@oxlint/binding-linux-riscv64-gnu@1.55.0", "", { "os": "linux", "cpu": "none" }, "sha512-clj0lnIN+V52G9tdtZl0LbdTSurnZ1NZj92Je5X4lC7gP5jiCSW+Y/oiDiSauBAD4wrHt2S7nN3pA0zfKYK/6Q=="], + "@oxlint/binding-linux-riscv64-gnu": ["@oxlint/binding-linux-riscv64-gnu@1.56.0", "", { "os": "linux", "cpu": "none" }, "sha512-j0CcMBOgV6KsRaBdsebIeiy7hCjEvq2KdEsiULf2LZqAq0v1M1lWjelhCV57LxsqaIGChXFuFJ0RiFrSRHPhSg=="], - "@oxlint/binding-linux-riscv64-musl": ["@oxlint/binding-linux-riscv64-musl@1.55.0", "", { "os": "linux", "cpu": "none" }, 
"sha512-NNu08pllN5x/O94/sgR3DA8lbrGBnTHsINZZR0hcav1sj79ksTiKKm1mRzvZvacwQ0hUnGinFo+JO75ok2PxYg=="], + "@oxlint/binding-linux-riscv64-musl": ["@oxlint/binding-linux-riscv64-musl@1.56.0", "", { "os": "linux", "cpu": "none" }, "sha512-7VDOiL8cDG3DQ/CY3yKjbV1c4YPvc4vH8qW09Vv+5ukq3l/Kcyr6XGCd5NvxUmxqDb2vjMpM+eW/4JrEEsUetA=="], - "@oxlint/binding-linux-s390x-gnu": ["@oxlint/binding-linux-s390x-gnu@1.55.0", "", { "os": "linux", "cpu": "s390x" }, "sha512-BvfQz3PRlWZRoEZ17dZCqgQsMRdpzGZomJkVATwCIGhHVVeHJMQdmdXPSjcT1DCNUrOjXnVyj1RGDj5+/Je2+Q=="], + "@oxlint/binding-linux-s390x-gnu": ["@oxlint/binding-linux-s390x-gnu@1.56.0", "", { "os": "linux", "cpu": "s390x" }, "sha512-JGRpX0M+ikD3WpwJ7vKcHKV6Kg0dT52BW2Eu2BupXotYeqGXBrbY+QPkAyKO6MNgKozyTNaRh3r7g+VWgyAQYQ=="], - "@oxlint/binding-linux-x64-gnu": ["@oxlint/binding-linux-x64-gnu@1.55.0", "", { "os": "linux", "cpu": "x64" }, "sha512-ngSOoFCSBMKVQd24H8zkbcBNc7EHhjnF1sv3mC9NNXQ/4rRjI/4Dj9+9XoDZeFEkF1SX1COSBXF1b2Pr9rqdEw=="], + "@oxlint/binding-linux-x64-gnu": ["@oxlint/binding-linux-x64-gnu@1.56.0", "", { "os": "linux", "cpu": "x64" }, "sha512-dNaICPvtmuxFP/VbqdofrLqdS3bM/AKJN3LMJD52si44ea7Be1cBk6NpfIahaysG9Uo+L98QKddU9CD5L8UHnQ=="], - "@oxlint/binding-linux-x64-musl": ["@oxlint/binding-linux-x64-musl@1.55.0", "", { "os": "linux", "cpu": "x64" }, "sha512-BDpP7W8GlaG7BR6QjGZAleYzxoyKc/D24spZIF2mB3XsfALQJJT/OBmP8YpeTb1rveFSBHzl8T7l0aqwkWNdGA=="], + "@oxlint/binding-linux-x64-musl": ["@oxlint/binding-linux-x64-musl@1.56.0", "", { "os": "linux", "cpu": "x64" }, "sha512-pF1vOtM+GuXmbklM1hV8WMsn6tCNPvkUzklj/Ej98JhlanbmA2RB1BILgOpwSuCTRTIYx2MXssmEyQQ90QF5aA=="], - "@oxlint/binding-openharmony-arm64": ["@oxlint/binding-openharmony-arm64@1.55.0", "", { "os": "none", "cpu": "arm64" }, "sha512-PS6GFvmde/pc3fCA2Srt51glr8Lcxhpf6WIBFfLphndjRrD34NEcses4TSxQrEcxYo6qVywGfylM0ZhSCF2gGA=="], + "@oxlint/binding-openharmony-arm64": ["@oxlint/binding-openharmony-arm64@1.56.0", "", { "os": "none", "cpu": "arm64" }, 
"sha512-bp8NQ4RE6fDIFLa4bdBiOA+TAvkNkg+rslR+AvvjlLTYXLy9/uKAYLQudaQouWihLD/hgkrXIKKzXi5IXOewwg=="], - "@oxlint/binding-win32-arm64-msvc": ["@oxlint/binding-win32-arm64-msvc@1.55.0", "", { "os": "win32", "cpu": "arm64" }, "sha512-P6JcLJGs/q1UOvDLzN8otd9JsH4tsuuPDv+p7aHqHM3PrKmYdmUvkNj4K327PTd35AYcznOCN+l4ZOaq76QzSw=="], + "@oxlint/binding-win32-arm64-msvc": ["@oxlint/binding-win32-arm64-msvc@1.56.0", "", { "os": "win32", "cpu": "arm64" }, "sha512-PxT4OJDfMOQBzo3OlzFb9gkoSD+n8qSBxyVq2wQSZIHFQYGEqIRTo9M0ZStvZm5fdhMqaVYpOnJvH2hUMEDk/g=="], - "@oxlint/binding-win32-ia32-msvc": ["@oxlint/binding-win32-ia32-msvc@1.55.0", "", { "os": "win32", "cpu": "ia32" }, "sha512-gzkk4zE2zsE+WmRxFOiAZHpCpUNDFytEakqNXoNHW+PnYEOTPKDdW6nrzgSeTbGKVPXNAKQnRnMgrh7+n3Xueg=="], + "@oxlint/binding-win32-ia32-msvc": ["@oxlint/binding-win32-ia32-msvc@1.56.0", "", { "os": "win32", "cpu": "ia32" }, "sha512-PTRy6sIEPqy2x8PTP1baBNReN/BNEFmde0L+mYeHmjXE1Vlcc9+I5nsqENsB2yAm5wLkzPoTNCMY/7AnabT4/A=="], - "@oxlint/binding-win32-x64-msvc": ["@oxlint/binding-win32-x64-msvc@1.55.0", "", { "os": "win32", "cpu": "x64" }, "sha512-ZFALNow2/og75gvYzNP7qe+rREQ5xunktwA+lgykoozHZ6hw9bqg4fn5j2UvG4gIn1FXqrZHkOAXuPf5+GOYTQ=="], + "@oxlint/binding-win32-x64-msvc": ["@oxlint/binding-win32-x64-msvc@1.56.0", "", { "os": "win32", "cpu": "x64" }, "sha512-ZHa0clocjLmIDr+1LwoWtxRcoYniAvERotvwKUYKhH41NVfl0Y4LNbyQkwMZzwDvKklKGvGZ5+DAG58/Ik47tQ=="], "@quansync/fs": ["@quansync/fs@1.0.0", "", { "dependencies": { "quansync": "^1.0.0" } }, "sha512-4TJ3DFtlf1L5LDMaM6CanJ/0lckGNtJcMjQ1NAV6zDmA0tEHKZtxNKin8EgPaVX1YzljbxckyT2tJrpQKAtngQ=="], @@ -190,7 +190,7 @@ "get-tsconfig": ["get-tsconfig@4.13.6", "", { "dependencies": { "resolve-pkg-maps": "^1.0.0" } }, "sha512-shZT/QMiSHc/YBLxxOkMtgSid5HFoauqCE3/exfsEcwg1WkeqjG+V40yBbBrsD+jW2HDXcs28xOfcbm2jI8Ddw=="], - "hookable": ["hookable@6.0.1", "", {}, "sha512-uKGyY8BuzN/a5gvzvA+3FVWo0+wUjgtfSdnmjtrOVwQCZPHpHDH2WRO3VZSOeluYrHoDCiXFffZXs8Dj1ULWtw=="], + "hookable": ["hookable@6.1.0", "", {}, 
"sha512-ZoKZSJgu8voGK2geJS+6YtYjvIzu9AOM/KZXsBxr83uhLL++e9pEv/dlgwgy3dvHg06kTz6JOh1hk3C8Ceiymw=="], "import-without-cache": ["import-without-cache@0.2.5", "", {}, "sha512-B6Lc2s6yApwnD2/pMzFh/d5AVjdsDXjgkeJ766FmFuJELIGHNycKRj+l3A39yZPM4CchqNCB4RITEAYB1KUM6A=="], @@ -198,9 +198,9 @@ "obug": ["obug@2.1.1", "", {}, "sha512-uTqF9MuPraAQ+IsnPf366RG4cP9RtUi7MLO1N3KEc+wb0a6yKpeL0lmk2IB1jY5KHPAlTc6T/JRdC/YqxHNwkQ=="], - "oxfmt": ["oxfmt@0.40.0", "", { "dependencies": { "tinypool": "2.1.0" }, "optionalDependencies": { "@oxfmt/binding-android-arm-eabi": "0.40.0", "@oxfmt/binding-android-arm64": "0.40.0", "@oxfmt/binding-darwin-arm64": "0.40.0", "@oxfmt/binding-darwin-x64": "0.40.0", "@oxfmt/binding-freebsd-x64": "0.40.0", "@oxfmt/binding-linux-arm-gnueabihf": "0.40.0", "@oxfmt/binding-linux-arm-musleabihf": "0.40.0", "@oxfmt/binding-linux-arm64-gnu": "0.40.0", "@oxfmt/binding-linux-arm64-musl": "0.40.0", "@oxfmt/binding-linux-ppc64-gnu": "0.40.0", "@oxfmt/binding-linux-riscv64-gnu": "0.40.0", "@oxfmt/binding-linux-riscv64-musl": "0.40.0", "@oxfmt/binding-linux-s390x-gnu": "0.40.0", "@oxfmt/binding-linux-x64-gnu": "0.40.0", "@oxfmt/binding-linux-x64-musl": "0.40.0", "@oxfmt/binding-openharmony-arm64": "0.40.0", "@oxfmt/binding-win32-arm64-msvc": "0.40.0", "@oxfmt/binding-win32-ia32-msvc": "0.40.0", "@oxfmt/binding-win32-x64-msvc": "0.40.0" }, "bin": { "oxfmt": "bin/oxfmt" } }, "sha512-g0C3I7xUj4b4DcagevM9kgH6+pUHytikxUcn3/VUkvzTNaaXBeyZqb7IBsHwojeXm4mTBEC/aBjBTMVUkZwWUQ=="], + "oxfmt": ["oxfmt@0.41.0", "", { "dependencies": { "tinypool": "2.1.0" }, "optionalDependencies": { "@oxfmt/binding-android-arm-eabi": "0.41.0", "@oxfmt/binding-android-arm64": "0.41.0", "@oxfmt/binding-darwin-arm64": "0.41.0", "@oxfmt/binding-darwin-x64": "0.41.0", "@oxfmt/binding-freebsd-x64": "0.41.0", "@oxfmt/binding-linux-arm-gnueabihf": "0.41.0", "@oxfmt/binding-linux-arm-musleabihf": "0.41.0", "@oxfmt/binding-linux-arm64-gnu": "0.41.0", "@oxfmt/binding-linux-arm64-musl": "0.41.0", 
"@oxfmt/binding-linux-ppc64-gnu": "0.41.0", "@oxfmt/binding-linux-riscv64-gnu": "0.41.0", "@oxfmt/binding-linux-riscv64-musl": "0.41.0", "@oxfmt/binding-linux-s390x-gnu": "0.41.0", "@oxfmt/binding-linux-x64-gnu": "0.41.0", "@oxfmt/binding-linux-x64-musl": "0.41.0", "@oxfmt/binding-openharmony-arm64": "0.41.0", "@oxfmt/binding-win32-arm64-msvc": "0.41.0", "@oxfmt/binding-win32-ia32-msvc": "0.41.0", "@oxfmt/binding-win32-x64-msvc": "0.41.0" }, "bin": { "oxfmt": "bin/oxfmt" } }, "sha512-sKLdJZdQ3bw6x9qKiT7+eID4MNEXlDHf5ZacfIircrq6Qwjk0L6t2/JQlZZrVHTXJawK3KaMuBoJnEJPcqCEdg=="], - "oxlint": ["oxlint@1.55.0", "", { "optionalDependencies": { "@oxlint/binding-android-arm-eabi": "1.55.0", "@oxlint/binding-android-arm64": "1.55.0", "@oxlint/binding-darwin-arm64": "1.55.0", "@oxlint/binding-darwin-x64": "1.55.0", "@oxlint/binding-freebsd-x64": "1.55.0", "@oxlint/binding-linux-arm-gnueabihf": "1.55.0", "@oxlint/binding-linux-arm-musleabihf": "1.55.0", "@oxlint/binding-linux-arm64-gnu": "1.55.0", "@oxlint/binding-linux-arm64-musl": "1.55.0", "@oxlint/binding-linux-ppc64-gnu": "1.55.0", "@oxlint/binding-linux-riscv64-gnu": "1.55.0", "@oxlint/binding-linux-riscv64-musl": "1.55.0", "@oxlint/binding-linux-s390x-gnu": "1.55.0", "@oxlint/binding-linux-x64-gnu": "1.55.0", "@oxlint/binding-linux-x64-musl": "1.55.0", "@oxlint/binding-openharmony-arm64": "1.55.0", "@oxlint/binding-win32-arm64-msvc": "1.55.0", "@oxlint/binding-win32-ia32-msvc": "1.55.0", "@oxlint/binding-win32-x64-msvc": "1.55.0" }, "peerDependencies": { "oxlint-tsgolint": ">=0.15.0" }, "optionalPeers": ["oxlint-tsgolint"], "bin": { "oxlint": "bin/oxlint" } }, "sha512-T+FjepiyWpaZMhekqRpH8Z3I4vNM610p6w+Vjfqgj5TZUxHXl7N8N5IPvmOU8U4XdTRxqtNNTh9Y4hLtr7yvFg=="], + "oxlint": ["oxlint@1.56.0", "", { "optionalDependencies": { "@oxlint/binding-android-arm-eabi": "1.56.0", "@oxlint/binding-android-arm64": "1.56.0", "@oxlint/binding-darwin-arm64": "1.56.0", "@oxlint/binding-darwin-x64": "1.56.0", "@oxlint/binding-freebsd-x64": 
"1.56.0", "@oxlint/binding-linux-arm-gnueabihf": "1.56.0", "@oxlint/binding-linux-arm-musleabihf": "1.56.0", "@oxlint/binding-linux-arm64-gnu": "1.56.0", "@oxlint/binding-linux-arm64-musl": "1.56.0", "@oxlint/binding-linux-ppc64-gnu": "1.56.0", "@oxlint/binding-linux-riscv64-gnu": "1.56.0", "@oxlint/binding-linux-riscv64-musl": "1.56.0", "@oxlint/binding-linux-s390x-gnu": "1.56.0", "@oxlint/binding-linux-x64-gnu": "1.56.0", "@oxlint/binding-linux-x64-musl": "1.56.0", "@oxlint/binding-openharmony-arm64": "1.56.0", "@oxlint/binding-win32-arm64-msvc": "1.56.0", "@oxlint/binding-win32-ia32-msvc": "1.56.0", "@oxlint/binding-win32-x64-msvc": "1.56.0" }, "peerDependencies": { "oxlint-tsgolint": ">=0.15.0" }, "optionalPeers": ["oxlint-tsgolint"], "bin": { "oxlint": "bin/oxlint" } }, "sha512-Q+5Mj5PVaH/R6/fhMMFzw4dT+KPB+kQW4kaL8FOIq7tfhlnEVp6+3lcWqFruuTNlUo9srZUW3qH7Id4pskeR6g=="], "pathe": ["pathe@2.0.3", "", {}, "sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w=="], @@ -218,7 +218,7 @@ "semver": ["semver@7.7.4", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA=="], - "tinyexec": ["tinyexec@1.0.2", "", {}, "sha512-W/KYk+NFhkmsYpuHq5JykngiOCnxeVL8v8dFnqxSD8qEEdRfXk1SDM6JzNqcERbcGYj9tMrDQBYV9cjgnunFIg=="], + "tinyexec": ["tinyexec@1.0.4", "", {}, "sha512-u9r3uZC0bdpGOXtlxUIdwf9pkmvhqJdrVCH9fapQtgy/OeTTMZ1nqH7agtvEfmGui6e1XxjcdrlxvxJvc3sMqw=="], "tinyglobby": ["tinyglobby@0.2.15", "", { "dependencies": { "fdir": "^6.5.0", "picomatch": "^4.0.3" } }, "sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ=="], @@ -226,7 +226,7 @@ "tree-kill": ["tree-kill@1.2.2", "", { "bin": { "tree-kill": "cli.js" } }, "sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A=="], - "tsdown": ["tsdown@0.21.2", "", { "dependencies": { "ansis": "^4.2.0", "cac": "^7.0.0", "defu": 
"^6.1.4", "empathic": "^2.0.0", "hookable": "^6.0.1", "import-without-cache": "^0.2.5", "obug": "^2.1.1", "picomatch": "^4.0.3", "rolldown": "1.0.0-rc.9", "rolldown-plugin-dts": "^0.22.5", "semver": "^7.7.4", "tinyexec": "^1.0.2", "tinyglobby": "^0.2.15", "tree-kill": "^1.2.2", "unconfig-core": "^7.5.0", "unrun": "^0.2.32" }, "peerDependencies": { "@arethetypeswrong/core": "^0.18.1", "@tsdown/css": "0.21.2", "@tsdown/exe": "0.21.2", "@vitejs/devtools": "*", "publint": "^0.3.0", "typescript": "^5.0.0", "unplugin-unused": "^0.5.0" }, "optionalPeers": ["@arethetypeswrong/core", "@tsdown/css", "@tsdown/exe", "@vitejs/devtools", "publint", "typescript", "unplugin-unused"], "bin": { "tsdown": "dist/run.mjs" } }, "sha512-pP8eAcd1XAWjl5gjosuJs0BAuVoheUe3V8VDHx31QK7YOgXjcCMsBSyFWO3CMh/CSUkjRUzR96JtGH3WJFTExQ=="], + "tsdown": ["tsdown@0.21.4", "", { "dependencies": { "ansis": "^4.2.0", "cac": "^7.0.0", "defu": "^6.1.4", "empathic": "^2.0.0", "hookable": "^6.1.0", "import-without-cache": "^0.2.5", "obug": "^2.1.1", "picomatch": "^4.0.3", "rolldown": "1.0.0-rc.9", "rolldown-plugin-dts": "^0.22.5", "semver": "^7.7.4", "tinyexec": "^1.0.4", "tinyglobby": "^0.2.15", "tree-kill": "^1.2.2", "unconfig-core": "^7.5.0", "unrun": "^0.2.32" }, "peerDependencies": { "@arethetypeswrong/core": "^0.18.1", "@tsdown/css": "0.21.4", "@tsdown/exe": "0.21.4", "@vitejs/devtools": "*", "publint": "^0.3.0", "typescript": "^5.0.0", "unplugin-unused": "^0.5.0" }, "optionalPeers": ["@arethetypeswrong/core", "@tsdown/css", "@tsdown/exe", "@vitejs/devtools", "publint", "typescript", "unplugin-unused"], "bin": { "tsdown": "dist/run.mjs" } }, "sha512-Q/kBi8SXkr4X6JI/NAZKZY1UuiEcbuXtIskL4tZCsgpDiEPM/2W6lC+OonNA31S+V3KsWedFvbFDBs23hvt+Aw=="], "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], @@ -238,7 +238,7 @@ "unrun": ["unrun@0.2.32", "", { "dependencies": { "rolldown": "1.0.0-rc.9" }, "peerDependencies": { "synckit": 
"^0.11.11" }, "optionalPeers": ["synckit"], "bin": { "unrun": "dist/cli.mjs" } }, "sha512-opd3z6791rf281JdByf0RdRQrpcc7WyzqittqIXodM/5meNWdTwrVxeyzbaCp4/Rgls/um14oUaif1gomO8YGg=="], - "yaml": ["yaml@2.8.2", "", { "bin": { "yaml": "bin.mjs" } }, "sha512-mplynKqc1C2hTVYxd0PU2xQAc22TI1vShAYGksCCfxbn/dFwnHTNi1bvYsBTkhdUNtGIf5xNOg938rrSSYvS9A=="], + "yaml": ["yaml@2.8.3", "", { "bin": { "yaml": "bin.mjs" } }, "sha512-AvbaCLOO2Otw/lW5bmh9d/WEdcDFdQp2Z2ZUH3pX9U2ihyUY0nvLv7J6TrWowklRGPYbB/IuIMfYgxaCPg5Bpg=="], "ast-kit/@babel/parser": ["@babel/parser@8.0.0-rc.1", "", { "dependencies": { "@babel/types": "^8.0.0-rc.1" }, "bin": "./bin/babel-parser.js" }, "sha512-6HyyU5l1yK/7h9Ki52i5h6mDAx4qJdiLQO4FdCyJNoB/gy3T3GGJdhQzzbZgvgZCugYBvwtQiWRt94QKedHnkA=="], diff --git a/crates/language-detector/Cargo.toml b/crates/language-detector/Cargo.toml new file mode 100644 index 0000000..8329e68 --- /dev/null +++ b/crates/language-detector/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "language-detector" +version = "0.1.0" +edition = "2021" +description = "Internal WASM-backed language detector for word-counter" +repository = "https://github.com/dev-pi2pie/word-counter" +license = "MIT" +publish = false + +[lib] +crate-type = ["cdylib", "rlib"] + +[dependencies] +serde = { version = "1", features = ["derive"] } +serde-wasm-bindgen = "0.6" +wasm-bindgen = "0.2" +whatlang = "0.16" diff --git a/crates/language-detector/LICENSE b/crates/language-detector/LICENSE new file mode 100644 index 0000000..06c4ebb --- /dev/null +++ b/crates/language-detector/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 dev-pi2pie + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software 
is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/crates/language-detector/src/lib.rs b/crates/language-detector/src/lib.rs new file mode 100644 index 0000000..c426737 --- /dev/null +++ b/crates/language-detector/src/lib.rs @@ -0,0 +1,102 @@ +use serde::Serialize; +use serde_wasm_bindgen::to_value; +use wasm_bindgen::prelude::*; +use whatlang::{detect, Lang}; + +#[derive(Serialize)] +struct DetectionResult { + lang: String, + script: String, + confidence: f64, + reliable: bool, +} + +fn lang_to_iso_639_3(lang: Lang) -> &'static str { + match lang { + Lang::Epo => "epo", + Lang::Eng => "eng", + Lang::Rus => "rus", + Lang::Cmn => "cmn", + Lang::Spa => "spa", + Lang::Por => "por", + Lang::Ita => "ita", + Lang::Ben => "ben", + Lang::Fra => "fra", + Lang::Deu => "deu", + Lang::Ukr => "ukr", + Lang::Kat => "kat", + Lang::Ara => "ara", + Lang::Hin => "hin", + Lang::Jpn => "jpn", + Lang::Heb => "heb", + Lang::Yid => "yid", + Lang::Pol => "pol", + Lang::Amh => "amh", + Lang::Jav => "jav", + Lang::Kor => "kor", + Lang::Nob => "nob", + Lang::Dan => "dan", + Lang::Swe => "swe", + Lang::Fin => "fin", + Lang::Tur => "tur", + Lang::Nld => "nld", + Lang::Hun => "hun", + Lang::Ces => "ces", + Lang::Ell => "ell", + Lang::Bul => "bul", + Lang::Bel => "bel", + Lang::Mar => "mar", + Lang::Kan => "kan", + Lang::Ron => "ron", + Lang::Slv => "slv", + Lang::Hrv => "hrv", + 
Lang::Srp => "srp", + Lang::Mkd => "mkd", + Lang::Lit => "lit", + Lang::Lav => "lav", + Lang::Est => "est", + Lang::Tam => "tam", + Lang::Vie => "vie", + Lang::Urd => "urd", + Lang::Tha => "tha", + Lang::Guj => "guj", + Lang::Uzb => "uzb", + Lang::Pan => "pan", + Lang::Aze => "aze", + Lang::Ind => "ind", + Lang::Tel => "tel", + Lang::Pes => "pes", + Lang::Mal => "mal", + Lang::Ori => "ori", + Lang::Mya => "mya", + Lang::Nep => "nep", + Lang::Sin => "sin", + Lang::Khm => "khm", + Lang::Tuk => "tuk", + Lang::Aka => "aka", + Lang::Zul => "zul", + Lang::Sna => "sna", + Lang::Afr => "afr", + Lang::Lat => "lat", + Lang::Slk => "slk", + Lang::Cat => "cat", + Lang::Tgl => "tgl", + Lang::Hye => "hye", + } +} + +#[wasm_bindgen] +pub fn detect_language(text: &str, _route_tag: &str) -> JsValue { + let Some(info) = detect(text) else { + return JsValue::NULL; + }; + + let result = DetectionResult { + lang: lang_to_iso_639_3(info.lang()).to_string(), + script: info.script().name().to_string(), + confidence: info.confidence(), + reliable: info.is_reliable(), + }; + + to_value(&result).unwrap_or(JsValue::NULL) +} diff --git a/docs/locale-tag-detection-notes.md b/docs/locale-tag-detection-notes.md index 3e0512e..8fb4af2 100644 --- a/docs/locale-tag-detection-notes.md +++ b/docs/locale-tag-detection-notes.md @@ -1,6 +1,7 @@ --- title: "Locale Tag Detection Notes" created-date: 2026-02-19 +modified-date: 2026-03-23 status: active agent: Codex --- @@ -11,9 +12,13 @@ Document current locale-tag detection behavior, known limits, and override flags ## Detection Model -- Detection is regex/script based (Unicode script checks), not a statistical language-ID model. +- Default detection is regex/script based (Unicode script checks), not a statistical language-ID model. - Ambiguous Latin text uses `und-Latn` unless a Latin hint is provided. - Han-script fallback uses `und-Hani` by default because regex script checks cannot natively distinguish `zh-Hans` vs `zh-Hant`. 
+- `--detector wasm` is an optional detector-assisted route for ambiguous chunks only. +- The first WASM detector engine is `whatlang`, remapped into this package's public tag contract. +- `--detector regex` keeps the existing chunk-first detection behavior. +- `--detector wasm` keeps the counting chunk model but uses a detector-oriented ambiguous-window scoring pass before relabeling those chunks. ## Built-in Latin Diacritic Heuristics @@ -29,6 +34,9 @@ Document current locale-tag detection behavior, known limits, and override flags ## Overrides and Inspection +- Use `--detector <mode>` to select detection mode: + - `regex` (default) + - `wasm` - Use `--latin-language <language>` or `--latin-tag <tag>` for ambiguous Latin text. - Use `--latin-hint <tag>=<pattern>` (repeatable) and `--latin-hints-file <path>` to add custom Latin rules. - Use `--no-default-latin-hints` to disable built-in Latin diacritic rules. @@ -40,4 +48,10 @@ Document current locale-tag detection behavior, known limits, and override flags - Regex/script-only detection cannot reliably identify English vs other Latin-script languages. - Latin text with unsupported diacritic patterns may remain in `und-Latn` unless hints are provided. +- WASM detection is conservative: + - `und-Latn` requires at least 24 script-bearing Latin characters + - `und-Hani` requires at least 12 script-bearing Han characters +- For ambiguous Latin text, the detector can also use a corroborated script-bearing sample path before accepting a tag. +- Low-confidence or unreliable WASM detector results fall back to the original `und-*` tag. +- `whatlang`-backed Han detection does not auto-emit `zh-Hans` or `zh-Hant`. - 100% certainty requires explicit metadata (document language tags, user-provided locale, headers) or a language-ID model.
diff --git a/docs/plans/jobs/2026-03-23-wasm-detector-phases-1-4-implementation.md b/docs/plans/jobs/2026-03-23-wasm-detector-phases-1-4-implementation.md new file mode 100644 index 0000000..6d5b69a --- /dev/null +++ b/docs/plans/jobs/2026-03-23-wasm-detector-phases-1-4-implementation.md @@ -0,0 +1,70 @@ +--- +title: "wasm detector phases 1-4 implementation" +created-date: 2026-03-23 +modified-date: 2026-03-23 +status: completed +agent: Codex +--- + +## Goal + +Implement the first WASM detector delivery slice across public API, internal detector routing, Rust/WASM build scaffolding, and the detector remap contract while preserving the current regex/script detector as the default. + +## What Changed + +- Added the detector-specific package surface and runtime boundary: + - `src/detector/index.ts` + - `src/detector/index.cjs.ts` + - `src/detector/types.ts` + - `src/detector/none.ts` + - `src/detector/wasm.ts` + - `src/detector/policy.ts` + - `src/detector/whatlang-map.ts` + - `src/detector/whatlang-wasm.ts` + - `src/detector/result-builder.ts` + - `src/detector/sections.ts` +- Added detector-aware CLI and runtime plumbing: + - `--detector <mode>` with default `regex` + - detector mode propagation through single-input and batch paths + - explicit detector subpath export via `@dev-pi2pie/word-counter/detector` +- Added the internal Rust crate and WASM build helper: + - `crates/language-detector/Cargo.toml` + - `crates/language-detector/src/lib.rs` + - `scripts/build-wasm.mjs` +- Updated build and package surface: + - `package.json` + - `tsdown.config.ts` + - `.github/workflows/publish-npm-packages.yml` + - `.github/workflows/publish-github-packages.yml` + - `.gitignore` +- Added the detector remap contract draft: + - `docs/schemas/detector-remap-contract.md` +- Added or updated tests for detector surface and detector-aware routing: + - `test/detector-interop.test.ts` + - `test/word-counter.test.ts` + - `test/command.test.ts` + +## Current Status + +- Phase 1 is implemented
and validated. +- Phase 2 detector routing is implemented and validated. +- Phase 3 Rust/WASM crate and build flow are implemented and validated locally. +- Phase 4 remap contract draft is documented. +- Remaining overall plan work lives in later documentation, workflow, and follow-up validation items outside this job record. + +## Validation + +- `bun run type-check` +- `bun run build:wasm` +- `bun run build` +- `bun test test/word-counter.test.ts test/command.test.ts test/detector-interop.test.ts` + +## Related Research + +- `docs/researches/research-2026-02-18-wasm-language-detector-spike.md` +- `docs/researches/research-2026-03-23-wasm-packaging-repo-structure.md` +- `docs/schemas/detector-remap-contract.md` + +## Related Plans + +- `docs/plans/plan-2026-03-23-wasm-language-detector-implementation.md` diff --git a/docs/plans/jobs/2026-03-24-detector-cjs-export-fix.md b/docs/plans/jobs/2026-03-24-detector-cjs-export-fix.md new file mode 100644 index 0000000..3bf4d7f --- /dev/null +++ b/docs/plans/jobs/2026-03-24-detector-cjs-export-fix.md @@ -0,0 +1,17 @@ +--- +title: "Fix detector CJS export parity" +created-date: 2026-03-24 +status: completed +agent: codex +--- + +Fix the published CommonJS detector wrapper so it exports the same runtime-unavailable message constant as the ESM detector entry and the detector subpath type surface. + +- Added `WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE` to `src/detector/index.cjs.ts`. +- Extended built-surface CJS interop coverage to assert the detector wrapper exposes the message constant. 
+ +Verification: + +- `bun run build` +- `bun test test/cjs-interop.test.ts` +- `bun run verify:package-contents` diff --git a/docs/plans/jobs/2026-03-24-detector-windowing-refinement.md b/docs/plans/jobs/2026-03-24-detector-windowing-refinement.md new file mode 100644 index 0000000..637221f --- /dev/null +++ b/docs/plans/jobs/2026-03-24-detector-windowing-refinement.md @@ -0,0 +1,100 @@ +--- +title: "detector windowing refinement" +created-date: 2026-03-24 +status: completed +agent: Codex +--- + +## Goal + +Improve detector-mode behavior for ambiguous Latin text by refining how WASM scoring samples are built and accepted before hardening the surrounding CI/CD workflow. + +## What Changed + +- Updated detector-mode scoring in `src/detector/wasm.ts` to evaluate ambiguous windows with: + - raw chunk-window text + - a normalized script-bearing sample path +- Added detector sample normalization in `src/detector/policy.ts`. +- Added a Latin corroboration acceptance rule: + - keep the existing conservative Latin threshold at `>= 0.75` with `reliable = true` + - additionally accept a Latin tag at `>= 0.70` when both raw and normalized samples agree on the same remapped tag +- Added regression coverage in `test/word-counter.test.ts` for: + - long ambiguous Latin promotion + - markdown-like Latin promotion + - short low-confidence English-like fallback +- Updated detector docs: + - `docs/schemas/detector-remap-contract.md` + - `docs/locale-tag-detection-notes.md` +- Marked Phase 6 complete in `docs/plans/plan-2026-03-23-wasm-language-detector-implementation.md`. 
+ +## Smoke Test Results + +Command: + +```bash +bun cli --detector wasm --path ./examples/test-case-multi-files-support --jobs 4 +``` + +Observed output after refinement: + +```text +Total words: 36 +Locale en: 10 words +Locale en: 11 words +Locale und-Latn: 6 words +Locale und-Latn: 9 words +``` + +Collector view: + +```bash +bun cli --detector wasm --path ./examples/test-case-multi-files-support --jobs 4 --mode collector +``` + +Observed output: + +```text +Total words: 36 +Locale en: 21 words +Locale und-Latn: 15 words +``` + +Per-file JSON smoke check: + +```bash +bun cli --detector wasm --path ./examples/test-case-multi-files-support --jobs 4 --per-file --format json --pretty +``` + +Observed locale outcome: + +- `examples/test-case-multi-files-support/a.md` -> `en` +- `examples/test-case-multi-files-support/b.mdx` -> `en` +- `examples/test-case-multi-files-support/c.txt` -> `und-Latn` +- `examples/test-case-multi-files-support/nested/d.markdown` -> `und-Latn` + +## Threshold Decision + +- Keep the base Latin acceptance policy conservative: + - confidence `>= 0.75` + - `reliable = true` +- Keep the base Han acceptance policy conservative: + - confidence `>= 0.90` + - `reliable = true` +- Add the corroborated Latin path at confidence `>= 0.70` only when raw and normalized samples agree on the same remapped tag. 
+ +## Validation + +- `bun run build` +- `bun run type-check` +- `bun test test/word-counter.test.ts` +- `bun test test/word-counter.test.ts test/command.test.ts test/detector-interop.test.ts` + +## Related Plans + +- `docs/plans/plan-2026-03-23-wasm-language-detector-implementation.md` + +## Related Research + +- `docs/researches/research-2026-02-18-wasm-language-detector-spike.md` +- `docs/researches/research-2026-03-23-wasm-packaging-repo-structure.md` +- `docs/schemas/detector-remap-contract.md` diff --git a/docs/plans/jobs/2026-03-24-phase7-cicd-follow-up.md b/docs/plans/jobs/2026-03-24-phase7-cicd-follow-up.md new file mode 100644 index 0000000..8c96d57 --- /dev/null +++ b/docs/plans/jobs/2026-03-24-phase7-cicd-follow-up.md @@ -0,0 +1,56 @@ +--- +title: "phase 7 cicd follow-up" +created-date: 2026-03-24 +status: completed +agent: Codex +--- + +## Goal + +Harden the current CI and publish workflows for the new Rust/WASM detector path and verify that packaged npm artifacts include the staged WASM runtime files. + +## What Changed + +- Added package-content verification script: + - `scripts/verify-package-contents.mjs` +- Added package verification npm script: + - `bun run verify:package-contents` +- Updated CI workflow: + - `.github/workflows/ci.yml` + - add Rust cache + - keep a single validation job for now + - verify packaged npm contents after build +- Updated publish workflows: + - `.github/workflows/publish-npm-packages.yml` + - `.github/workflows/publish-github-packages.yml` + - add Rust cache + - verify packaged npm contents before publish + +## Decisions + +- Keep `cargo install wasm-pack --locked` for now. + - Add Rust caching to reduce repeated setup cost. + - Revisit replacement only if workflow time becomes a practical problem. +- Keep a single CI validation job for now. + - Detector-aware validation is not yet split into a separate job. + - Current workflow complexity does not justify extra job coordination yet. 
+- Do not cache generated WASM outputs as workflow artifacts for now. + - Rebuild them during normal workflow execution. + - Cache Rust dependencies/tooling instead. +- Add package verification before publish. + - Ensure the staged runtime files are actually present in the npm package surface. + +## Validation + +- `bun run build` +- `bun run verify:package-contents` +- `bun run type-check` +- `bun test test/word-counter.test.ts test/command.test.ts test/detector-interop.test.ts` + +## Related Plans + +- `docs/plans/plan-2026-03-23-wasm-language-detector-implementation.md` + +## Related Research + +- `docs/researches/research-2026-03-23-wasm-packaging-repo-structure.md` diff --git a/docs/plans/jobs/2026-03-24-release-workflow-consolidation.md b/docs/plans/jobs/2026-03-24-release-workflow-consolidation.md new file mode 100644 index 0000000..5e40ccd --- /dev/null +++ b/docs/plans/jobs/2026-03-24-release-workflow-consolidation.md @@ -0,0 +1,66 @@ +--- +title: "release workflow consolidation" +created-date: 2026-03-24 +status: completed +agent: Codex +--- + +## Goal + +Consolidate release publishing into one workflow so the Rust/WASM build runs once per release and both registries reuse the same prepared package artifact. + +## What Changed + +- Updated `.github/workflows/release.yml` to become the single release orchestrator. +- Kept the existing `notes` job for tag resolution, branch validation, and release-notes generation. +- Added a `prepare` job that: + - checks out the release tag + - sets up Bun, Node, Rust, and `wasm-pack` + - installs dependencies + - runs `bun run build` + - runs `bun run verify:package-contents` + - uploads a release artifact containing: + - `dist/` + - `package.json` + - `README.md` + - `LICENSE*` +- Added `publish_npm` and `publish_github_packages` jobs that download the prepared artifact instead of rebuilding. +- Kept npm trusted publishing behavior inside the new `publish_npm` job. 
+- Kept GitHub Packages package-name rewriting and prerelease dist-tag behavior inside the new `publish_github_packages` job. +- Removed the duplicated tag-triggered workflows: + - `.github/workflows/publish-npm-packages.yml` + - `.github/workflows/publish-github-packages.yml` +- Updated the release flow so the final GitHub Release record is created only after both publish jobs succeed. + +## Decisions + +- Keep `.github/workflows/ci.yml` as a separate validation workflow. + - CI is still responsible for pull-request and integration-branch health. + - CI is not used as the source of release artifacts. +- Prefer same-workflow artifact reuse over cross-workflow artifact lookup. + - This avoids tag-to-run matching, rerun ambiguity, and artifact-retention coupling. +- Keep tag and branch validation in the `notes` job for now. + - The resolved tag context is already needed there for release-note generation. +- Keep the shared release artifact narrow. + - Publish jobs need package metadata and built outputs, not dependency trees. + +## Validation + +- Parsed `.github/workflows/ci.yml` successfully with the repository `yaml` package. +- Parsed `.github/workflows/release.yml` successfully with the repository `yaml` package. 
+ +## Follow-up + +- Real GitHub Actions validation is still needed for: + - stable vs prerelease routing + - manual `workflow_dispatch` with explicit `tag` + - rerun behavior for failed publish jobs + - confirmation that the uploaded artifact is sufficient for both registries in hosted runners + +## Related Plans + +- `docs/plans/plan-2026-03-24-release-workflow-consolidation.md` + +## Related Research + +- `docs/researches/research-2026-03-23-wasm-packaging-repo-structure.md` diff --git a/docs/plans/jobs/2026-03-24-review-findings-package-and-test-fixes.md b/docs/plans/jobs/2026-03-24-review-findings-package-and-test-fixes.md new file mode 100644 index 0000000..91b0186 --- /dev/null +++ b/docs/plans/jobs/2026-03-24-review-findings-package-and-test-fixes.md @@ -0,0 +1,19 @@ +--- +title: "Review findings package and test fixes" +created-date: 2026-03-24 +modified-date: 2026-03-24 +status: completed +agent: codex +--- + +Address follow-up review findings in package metadata and test setup. + +- Align published type entrypoints with the declaration filenames emitted by tsdown. +- Keep the default `bun test` workflow independent of prebuilt `dist/` artifacts. +- Preserve wasm coverage without making clean-checkout tests require generated runtime artifacts. + +Verification: + +- `bun test test/detector-interop.test.ts` +- `bun test test/word-counter.test.ts test/command.test.ts` +- `bun run verify:package-contents` diff --git a/docs/plans/jobs/2026-03-24-review-findings-windows-portability-fixes.md b/docs/plans/jobs/2026-03-24-review-findings-windows-portability-fixes.md new file mode 100644 index 0000000..123c61b --- /dev/null +++ b/docs/plans/jobs/2026-03-24-review-findings-windows-portability-fixes.md @@ -0,0 +1,17 @@ +--- +title: "Review findings windows portability fixes" +created-date: 2026-03-24 +modified-date: 2026-03-24 +status: completed +agent: codex +--- + +Address review findings for Windows portability in the WASM build helper and package typecheck test. 
+ +- Replaced manual `PATH` parsing in `scripts/build-wasm.mjs` with a direct command probe so tool detection respects platform-specific path and executable resolution. +- Updated `test/package-types.test.ts` to invoke the TypeScript CLI through the current JavaScript runtime instead of a Unix-only `node_modules/.bin/tsc` path. + +Verification: + +- `bun test test/package-types.test.ts` +- `bun test` diff --git a/docs/plans/jobs/2026-03-24-root-types-entrypoint-fix.md b/docs/plans/jobs/2026-03-24-root-types-entrypoint-fix.md new file mode 100644 index 0000000..bfe1011 --- /dev/null +++ b/docs/plans/jobs/2026-03-24-root-types-entrypoint-fix.md @@ -0,0 +1,33 @@ +--- +title: "Fix root package types entrypoint" +created-date: 2026-03-24 +modified-date: 2026-03-24 +status: completed +agent: codex +--- + +## What Changed + +- Updated `package.json` so the published root types point at `dist/esm/index2.d.mts`, which is the generated root facade that preserves the documented export names. +- Kept the generated `dist/esm/index.d.mts` and `dist/esm/index2.d.mts` outputs unchanged so the build artifacts still reflect the bundler's native output. +- Added a regression test that type-checks default and named imports from `@dev-pi2pie/word-counter`. + +## Why + +- The declaration bundler emitted the public root facade as `dist/esm/index2.d.mts` because of an internal filename collision, while `dist/esm/index.d.mts` contained minified export aliases. Pointing the package metadata at the generated facade fixes TypeScript consumers without rewriting the emitted build output. + +## Verification + +- Ran `bun test test/package-types.test.ts`. 
+ +## References + +- `rolldown-plugin-dts` README documents ESM declaration chunk generation and code-splitting behavior, which aligns with the root facade collision observed here.[^rolldown-plugin-dts] +- TypeScript module documentation explains why `.d.mts` filename/basename correctness matters for ESM package consumers.[^typescript-modules] +- Related ecosystem issue discussing `.d.ts` / `.d.mts` basename expectations for published packages.[^mkdist-138] +- Related ecosystem issue showing consumer-visible breakage when declaration files do not match the runtime module shape.[^rollup-1541] + +[^rolldown-plugin-dts]: https://github.com/sxzz/rolldown-plugin-dts +[^typescript-modules]: https://www.typescriptlang.org/docs/handbook/modules/theory.html +[^mkdist-138]: https://github.com/unjs/mkdist/issues/138 +[^rollup-1541]: https://github.com/rollup/plugins/issues/1541 diff --git a/docs/plans/plan-2026-03-23-wasm-language-detector-implementation.md b/docs/plans/plan-2026-03-23-wasm-language-detector-implementation.md new file mode 100644 index 0000000..8e788b0 --- /dev/null +++ b/docs/plans/plan-2026-03-23-wasm-language-detector-implementation.md @@ -0,0 +1,174 @@ +--- +title: "WASM language detector implementation" +created-date: 2026-03-23 +modified-date: 2026-03-23 +status: completed +agent: Codex +--- + +## Goal + +Implement an optional WASM-backed language detector for ambiguous script routes while preserving the current regex/script detector as the default behavior for both CLI and library consumers. + +## Scope + +- In scope: + - Add an opt-in detector mode for ambiguous `und-Latn` and `und-Hani` chunks. + - Keep the current regex/script detector as the default path. + - Introduce an internal Rust crate built to WASM and loaded through a TypeScript adapter. + - Support detector-enabled usage from both CLI and library entrypoints. + - Define a detector remap contract document for mapping engine output into public tags and fallback behavior. 
+ - Update build and publish flow so generated WASM runtime artifacts are produced during build/publish and included in the published npm package. +- Out of scope: + - Replacing the current default detector with statistical language ID. + - Committing generated WASM artifacts to the repository. + - Full workspace extraction such as `packages/core` and `packages/cli`. + - Shipping more than one production detector engine in the first pass. + - Auto-inferring `zh-Hans` vs `zh-Hant` from the detector path. + +## Phase Task Items + +### Phase 1 - Public Contract and Runtime Shape + +- [x] Add a detector mode contract that keeps regex/script detection as the default. +- [x] Add CLI support for `--detector <mode>` with: + - `regex` as the default + - `wasm` as the opt-in route +- [x] Keep `--detector-engine <engine>` out of the first public implementation unless a second real engine is added. +- [x] Add detector-aware runtime option plumbing from CLI parsing into count execution. +- [x] Define detector-facing TypeScript types for: + - detector mode + - detector result + - provenance source + - confidence and reliability fields +- [x] Preserve the existing default library API behavior. +- [x] Add an explicit detector-enabled library entrypoint instead of silently mutating the current sync API contract. +- [x] Plan the package-surface changes required for the detector-enabled library entrypoint: + - update the root `package.json` `exports` map + - decide the ESM entry file shape + - decide the CJS entry or compatibility strategy + - preserve or intentionally revise the current CJS interop contract +- [x] Add or extend package-surface tests so the detector-enabled entrypoint is reachable for supported ESM and CJS consumers.
+ +### Phase 2 - Internal Detector Boundary + +- [x] Add a new detector module boundary under `src/detector/`, proposed: + - `src/detector/index.ts` + - `src/detector/types.ts` + - `src/detector/none.ts` + - `src/detector/wasm.ts` +- [x] Route detector calls only for ambiguous buckets: + - `und-Latn` + - `und-Hani` +- [x] Apply the conservative threshold policy from research: + - count script-bearing characters only + - `und-Latn >= 24` + - `und-Hani >= 12` +- [x] Fall back to the original `und-*` tag when: + - chunk length is below threshold + - detector output is unsupported + - detector confidence is low + - detector reliability is false or otherwise unacceptable +- [x] Keep the default regex/script path unchanged when detector mode is not enabled. + +### Phase 3 - Rust Crate and WASM Build Flow + +- [x] Create the Rust crate at `crates/language-detector/`. +- [x] Start with `whatlang` as the first detector engine behind the WASM route. +- [x] Export a minimal Rust API that accepts: + - text + - coarse route or original ambiguous tag + - returns normalized detector fields needed by TypeScript +- [x] Add a build helper such as `scripts/build-wasm.mjs`. +- [x] Build WASM artifacts with `wasm-pack --target nodejs` into `generated/wasm-language-detector/`. +- [x] Do not commit generated WASM artifacts. +- [x] Copy or stage the runtime files into `dist/` as part of build/publish so the root npm package ships the generated wrapper and `.wasm` artifact. +- [x] Keep the root single-package publish model intact. +- [x] Make the Rust + `wasm-pack` toolchain an explicit build prerequisite wherever publishable artifacts are produced: + - local build workflow + - CI validation workflow + - npm publish workflow + - GitHub Packages publish workflow +- [x] Update automation or workflow setup so those environments provision Rust and `wasm-pack` before invoking the build that generates publishable outputs. 
+- [x] If any workflow intentionally avoids Rust/WASM setup, define the fallback behavior explicitly rather than assuming the root build can still produce publishable detector-enabled outputs. + +### Phase 4 - Detector Remap Contract + +- [x] Draft a detector remap schema or guide document under `docs/schemas/` or another stable docs location during implementation. +- [x] Define how `whatlang` outputs map into public language tags used by this package. +- [x] Define unsupported-language fallback rules back to `und-*`. +- [x] Define low-confidence and low-reliability fallback behavior. +- [x] Define Han-route policy explicitly: + - allow conservative remaps such as `cmn -> zh` only when accepted by the public contract + - do not auto-emit `zh-Hans` or `zh-Hant` +- [x] Define JSON provenance metadata for detector-assisted assignments. + +### Phase 5 - Integration, Tests, and Documentation + +- [x] Integrate detector-aware routing into the relevant locale segmentation and counting flow without regressing current behavior. +- [x] Add library tests covering: + - default regex behavior remains unchanged + - detector thresholds + - fallback to `und-*` + - detector-enabled library entrypoint behavior +- [x] Add CLI tests covering: + - `--detector regex` + - `--detector wasm` + - detector fallback behavior in JSON output +- [x] Add CLI validation coverage for invalid detector values. +- [x] Add package-surface tests covering: + - detector-enabled ESM export reachability + - detector-enabled CJS reachability or documented non-support + - current CJS wrapper compatibility remains correct for the existing root surface +- [x] Add build verification for generated runtime artifacts being present in the published package surface. +- [x] Add workflow verification for the Rust/`wasm-pack` toolchain path used by publishable builds. 
+- [x] Update `README.md` and any locale-detection docs with: + - default regex behavior + - `--detector <mode>` + - detector limitations + - fallback semantics + - explicit note that the npm package ships one portable WASM artifact rather than per-OS detector packages + +### Phase 6 - Detector Windowing Refinement + +- [x] Evaluate detector scoring on larger ambiguous spans instead of already-split counting chunks. +- [x] Define a merge policy for nearby `und-Latn` and `und-Hani` chunks across punctuation and whitespace boundaries when detector mode is enabled. +- [x] Run WASM detection on the merged span and compare the result against the current chunk-first detector behavior. +- [x] Define how an accepted detector result is projected back onto the underlying counting chunks without regressing current output semantics. +- [x] Add representative regression samples for short English-like text, frontmatter-heavy markdown, and punctuation-separated ambiguous Latin spans. +- [x] Reassess current confidence and reliability thresholds only after merged-window behavior is measured. + +### Phase 7 - CI/CD Follow-up + +- [x] Review whether `cargo install wasm-pack --locked` should remain the long-term workflow choice or be replaced with a faster cached setup approach. +- [x] Recheck release, publish, and CI workflow duplication now that Rust/WASM setup exists in multiple workflows. +- [x] Decide whether detector-aware validation should be split into separate workflow jobs for faster feedback. +- [x] Decide whether non-publish workflows need artifact caching for Rust and generated WASM outputs. +- [x] Add any additional release verification needed for npm package contents that include the staged WASM runtime. + +## Execution Notes + +- Keep the first implementation narrow and conservative. +- Prefer a Node-based build helper over adding extra shell-helper dependencies unless implementation friction proves otherwise.
+- Keep source files and generated artifacts separate: + - source/build workspace under `crates/` and `generated/` + - published runtime artifacts under `dist/` +- Treat the detector remap contract as a public behavior document and avoid leaking raw engine-specific identifiers into user-facing output. +- Do not broaden repository structure beyond the current root package unless the implementation later proves that package splitting is necessary. + +## Validation + +- `bun run build` +- `bun run type-check` +- `bun test test/word-counter.test.ts` +- `bun test test/command.test.ts` + +## Related Plans + +- `docs/plans/plan-2026-01-02-wc-refactor-locale-research.md` + +## Related Research + +- `docs/researches/research-2026-02-18-wasm-language-detector-spike.md` +- `docs/researches/research-2026-03-23-wasm-packaging-repo-structure.md` +- `docs/researches/research-2026-01-02-language-detection.md` diff --git a/docs/plans/plan-2026-03-24-release-workflow-consolidation.md b/docs/plans/plan-2026-03-24-release-workflow-consolidation.md new file mode 100644 index 0000000..9deac95 --- /dev/null +++ b/docs/plans/plan-2026-03-24-release-workflow-consolidation.md @@ -0,0 +1,164 @@ +--- +title: "Release workflow consolidation and artifact reuse" +created-date: 2026-03-24 +modified-date: 2026-03-24 +status: active +agent: Codex +--- + +## Goal + +Reduce duplicated Rust/WASM build work across release and publish automation by building publishable artifacts once per release run and reusing them for both npm and GitHub Packages publication. + +## Scope + +- In scope: + - Review the current split between `.github/workflows/ci.yml`, `.github/workflows/release.yml`, `.github/workflows/publish-npm-packages.yml`, and `.github/workflows/publish-github-packages.yml`. + - Define a single release orchestration flow that prepares one verified build artifact and reuses it for both registries. + - Keep branch and tag validation behavior aligned with the current release policy. 
+ - Preserve registry-specific publish behavior such as npm trusted publishing and GitHub Packages package-name rewriting. + - Reduce YAML duplication where practical without obscuring release logic. +- Out of scope: + - Changing package contents or the current WASM packaging model. + - Changing versioning or dist-tag policy. + - Replacing GitHub Releases notes generation logic unless required by consolidation. + - Removing CI validation for pull requests and integration branches. + +## Current Problem + +- The current tag-based release path fans out into three separate workflows: + - `.github/workflows/release.yml` + - `.github/workflows/publish-npm-packages.yml` + - `.github/workflows/publish-github-packages.yml` +- Both publish workflows repeat the same expensive setup and build work: + - Bun setup + - Node setup + - Rust toolchain setup + - `wasm-pack` install + - dependency install + - `bun run build` + - `bun run verify:package-contents` +- Cross-workflow artifact sharing is technically possible, but it adds avoidable complexity around matching tag runs, artifact lookup, reruns, and retention. + +## Recommended Direction + +- Keep `.github/workflows/ci.yml` as the validation workflow for pull requests and selected push branches. +- Turn `.github/workflows/release.yml` into the single tag/manual release orchestrator. +- Build once inside `release.yml`, upload one verified release artifact, and let both publish jobs consume it in the same workflow run. +- Remove `.github/workflows/publish-npm-packages.yml` and `.github/workflows/publish-github-packages.yml` after the consolidated release flow is proven. 
+ +## Target Workflow Shape + +### CI Workflow + +- Trigger on: + - `pull_request` + - selected `push` branches such as `main`, `dev*`, `canary*`, `alpha*`, and `beta*` +- Purpose: + - validate type-check, build, tests, and packaged contents + - do not publish + - do not act as the source of release artifacts for later workflows + +### Release Workflow + +- Trigger on: + - tag `push` + - `workflow_dispatch` +- Jobs: + - `notes` + - resolve tag + - validate allowed branch ancestry + - generate release notes + - `prepare` + - checkout the release ref + - set up Bun, Node, Rust, and `wasm-pack` + - install dependencies + - run `bun run build` + - run `bun run verify:package-contents` + - upload a release artifact that includes the built publish surface + - `publish_npm` + - depends on `prepare` + - downloads the prepared build artifact + - performs npm trusted publishing + - `publish_github_packages` + - depends on `prepare` + - downloads the same prepared build artifact + - applies GitHub Packages package-name override + - publishes to `npm.pkg.github.com` + - `release` + - depends on `notes`, `publish_npm`, and `publish_github_packages` + - creates the GitHub release record only after both publish jobs succeed + +## Artifact Strategy + +- Share artifacts inside the same `release.yml` workflow run instead of across workflows. +- Upload only the publish-relevant outputs and metadata needed by downstream jobs, for example: + - `dist/` + - `package.json` + - `README.md` + - `LICENSE*` +- Do not upload `node_modules`. +- Keep registry-specific mutation outside the shared artifact when possible: + - npm publish can use the prepared package view directly + - GitHub Packages can apply package-name rewriting in its own job after artifact download + +## Phase Task Items + +### Phase 1 - Release Flow Design + +- [x] Confirm the final job graph for `release.yml`. +- [x] Decide whether the `release` job should depend only on `notes` or on successful publication jobs as well. +- [x] Define the exact artifact contents required by both registry jobs.
+- [x] Define artifact naming so reruns and prereleases stay easy to inspect. +- [x] Decide whether tag and branch validation should stay in `notes`, move to `prepare`, or be extracted into a shared job. + +### Phase 2 - Workflow Consolidation + +- [x] Move shared release-build logic into `.github/workflows/release.yml`. +- [x] Add a single `prepare` job that builds and verifies the package once. +- [x] Add artifact upload and download steps for downstream publish jobs. +- [x] Move npm publish logic into a `publish_npm` job inside `release.yml`. +- [x] Move GitHub Packages publish logic into a `publish_github_packages` job inside `release.yml`. +- [x] Keep npm trusted publishing permissions and token handling intact. +- [x] Keep GitHub Packages registry and package-name override behavior intact. + +### Phase 3 - Cleanup and Deduplication + +- [x] Remove `.github/workflows/publish-npm-packages.yml` after the consolidated flow is validated. +- [x] Remove `.github/workflows/publish-github-packages.yml` after the consolidated flow is validated. +- [x] Recheck whether any shared setup should move into a composite action or reusable workflow for readability only. +- [x] Recheck whether `ci.yml` and `release.yml` should share any common helper logic for setup or verification. + +### Phase 4 - Validation and Rollout + +- [ ] Validate that stable and prerelease tags still route to the correct publish behavior. +- [ ] Validate that the built WASM runtime is present in the downloaded release artifact and in final published package contents. +- [ ] Validate that manual `workflow_dispatch` still supports explicit `tag` and optional `shallow_since`. +- [ ] Validate rerun behavior for failed publish jobs without requiring a second full build unless the source ref changed. +- [x] Add or update documentation for the new workflow responsibilities and trigger model. + +## Design Notes + +- Prefer same-workflow artifact reuse over `workflow_run` chaining. 
+- Avoid making `.github/workflows/ci.yml` the producer of release artifacts because CI runs are not release-scoped and are triggered by different events. +- Favor one authoritative release workflow over three independent tag-triggered workflows. +- Keep the release artifact narrow and deterministic so registry jobs publish the same build output. +- Keep tag and branch validation in the `notes` job for now so release-note generation and publish gating continue to share the same resolved tag context. +- Make the `release` job depend on successful registry jobs so a GitHub Release record is not created for a failed publication run. + +## Success Criteria + +- A tag or manual release run builds publishable artifacts exactly once. +- npm and GitHub Packages publishing reuse the same prepared build output from the same workflow run. +- Release notes and publish gates remain consistent with the current branch and prerelease policy. +- `ci.yml` continues to provide non-release validation without becoming part of the release artifact chain. +- The old duplicate publish workflows can be removed without losing current behavior. 
+ +## Related Plans + +- `docs/plans/plan-2026-01-21-publish-flow-checkout.md` +- `docs/plans/plan-2026-03-23-wasm-language-detector-implementation.md` + +## Related Research + +- `docs/researches/research-2026-03-23-wasm-packaging-repo-structure.md` diff --git a/docs/researches/research-2026-02-18-wasm-language-detector-spike.md b/docs/researches/research-2026-02-18-wasm-language-detector-spike.md index 942999e..5eb25cd 100644 --- a/docs/researches/research-2026-02-18-wasm-language-detector-spike.md +++ b/docs/researches/research-2026-02-18-wasm-language-detector-spike.md @@ -1,7 +1,8 @@ --- title: "WASM Language Detector Spike for Ambiguous Script Routes" created-date: 2026-02-18 -status: draft +modified-date: 2026-03-24 +status: completed agent: Codex --- @@ -25,8 +26,8 @@ Deliver a spike decision for `v0.1.x`: whether to ship a WASM detector path behi ## Recommended Pipeline 1. Keep current regex/script gate as Step 0 (`und-Latn`, `und-Hani`, etc). -2. Run WASM detection only for ambiguous runs and only when text length is above a minimum threshold. -3. Keep this route opt-in first (for example `--detector wasm`) to avoid changing default performance/behavior. +2. Run WASM detection only for ambiguous runs and only when script-bearing text length is above a minimum threshold. +3. Keep this route opt-in first via `--detector wasm`; default detector mode remains regex/script detection. 4. Add provenance metadata in JSON output to show resolution source (for example `script`, `hint`, `wasm`). ## Candidate Approaches @@ -54,6 +55,15 @@ Deliver a spike decision for `v0.1.x`: whether to ship a WASM detector path behi - Project targets Node.js `>=20` and bundles TS with `tsdown`; optional detector loading should be lazy to avoid startup regression for default path. - CLI and library behavior should remain unchanged unless detector mode is explicitly enabled. +## CLI Shape +- Recommend a new `--detector <mode>` option.
+- Initial detector modes: + - `regex` (default) + - `wasm` +- The first `wasm` engine should be `whatlang`. +- Reserve `--detector-engine <engine>` as a future extension point if multiple WASM-backed engines become viable. +- Do not expose engine selection in the first spike unless there is more than one supported implementation. + ## Proposed Spike Plan 1. Build a minimal detector interface in TS: - input: text chunk + wide tag @@ -61,7 +71,10 @@ Deliver a spike decision for `v0.1.x`: whether to ship a WASM detector path behi 2. Implement one Rust/WASM prototype using `whatlang` with `wasm-pack --target nodejs`. 3. Add threshold/routing policy: - only call detector for `und-Latn` / `und-Hani` - - skip very short chunks + - count script-bearing characters only, excluding whitespace, punctuation, and digits + - start with `und-Latn` chunks at `>=24` Latin letters + - start with `und-Hani` chunks at `>=12` Han characters + - skip shorter chunks and keep the original `und-*` tag 4. Add benchmark harness: - regex-only baseline - regex + WASM route on ambiguous corpora @@ -83,11 +96,24 @@ Deliver a spike decision for `v0.1.x`: whether to ship a WASM detector path behi ## Implications / Recommendations - Proceed with a Rust/WASM spike first (Option A), starting with `whatlang`. - Keep detector optional in `v0.1.x`; do not replace the current default route yet. +- Use `--detector wasm` as the opt-in CLI surface; keep regex/script detection as the default. +- Treat `whatlang` as the first engine behind the `wasm` mode rather than exposing a `whatlang`-specific top-level flag. +- Keep `--detector-engine <engine>` as a future-facing extension only; defer public engine selection until there is more than one real implementation candidate.
+- Use a conservative threshold policy first: + - measure only script-bearing characters + - start at `>=24` for `und-Latn` + - start at `>=12` for `und-Hani` + - tune later against representative samples +- Emit low-confidence or unreliable detector results as the original ambiguous `und-*` tag instead of forcing a language tag. - Reassess `cld3-asm` only if spike cost is too high and maintenance risk is acceptable. -## Open Questions -- What minimum chunk length should gate WASM detection for acceptable precision? -- Should low-confidence detector results be emitted as `und-*` instead of forced language tags? +## Resolution Notes +- Minimum chunk length should be measured in script-bearing characters, not raw string length. +- Recommended starting gate: + - `und-Latn`: run WASM detection only when a chunk contains at least `24` Latin letters + - `und-Hani`: run WASM detection only when a chunk contains at least `12` Han characters +- Chunks below the threshold should stay on the existing `und-*` route. +- Low-confidence or unreliable detector results should fall back to the original ambiguous `und-*` tag instead of forcing a language tag. ## Related Plans - `docs/plans/plan-2026-01-02-wc-refactor-locale-research.md` diff --git a/docs/researches/research-2026-03-23-wasm-packaging-repo-structure.md b/docs/researches/research-2026-03-23-wasm-packaging-repo-structure.md new file mode 100644 index 0000000..20a4fa8 --- /dev/null +++ b/docs/researches/research-2026-03-23-wasm-packaging-repo-structure.md @@ -0,0 +1,138 @@ +--- +title: "WASM Packaging and Repo Structure for Optional Detector" +created-date: 2026-03-23 +modified-date: 2026-03-24 +status: completed +agent: Codex +--- + +## Goal + +Define the lowest-risk repository and packaging shape for an internal WASM language detector without destabilizing the current single-package TypeScript layout. 
+ +## Milestone Goal + +Choose a packaging direction for the WASM spike that preserves the current Node.js package contract and avoids premature multi-package release work. + +## Key Findings + +- The current repo publishes exactly one package from the repository root via `package.json`, `tsdown.config.ts`, and the release automation under `.github/workflows/release.yml`. +- The current public library path is synchronous from `src/index.ts` into `src/wc/wc.ts` and `src/wc/segment.ts`. Any WASM design that forces async initialization is a larger contract change than the file-structure refactor itself. +- The current build and publish workflows only prepare Bun and Node before running the root build, so a Rust/WASM route requires an explicit toolchain plan for local build, CI, and publish flows. +- The existing WASM spike in `docs/researches/research-2026-02-18-wasm-language-detector-spike.md` is about detector feasibility and routing. It is not the right place to absorb package-boundary, build, and release-process decisions. +- A `packages/cli` + `packages/core-wasm` split would require more than moving files: + - a new workspace layout + - separate build orchestration + - separate publish/version decisions + - updated release automation + - clarified ownership between TypeScript core logic and Rust/WASM artifacts +- The proposed `core-wasm` boundary is not yet the natural domain boundary. Most of the real product logic still lives in the existing TypeScript word-counting core, not in WASM. + +## Recommended Direction + +- Keep the repository as a single published package for the WASM spike. +- Keep the existing `src/` layout as the main TypeScript source tree. +- Add a narrow detector boundary inside `src/` first, then attach an internal Rust/WASM implementation behind it. +- Treat Rust/WASM as an internal build artifact, not as a first-class published package in the first iteration. + +## Suggested Structure + +```text +. 
+├── package.json +├── src/ +│ ├── detector/ +│ │ ├── index.ts +│ │ ├── types.ts +│ │ ├── none.ts +│ │ └── wasm.ts +│ ├── wc/ +│ └── cli/ +├── crates/ +│ └── language-detector/ +│ ├── Cargo.toml +│ └── src/lib.rs +├── generated/ +│ └── wasm-language-detector/ +├── scripts/ +│ └── build-wasm.mjs +└── .github/workflows/ +``` + +## Naming Note + +- Use `crates/` instead of `rust/`. +- `crates/` is the more idiomatic Rust repository convention because it names the actual packaging/build unit. +- `rust/` describes the implementation language, but this directory is intended to hold one or more Rust crates rather than general Rust-related assets. + +## Why This Is Lower Risk + +- It preserves the current root `package.json` publish model. +- It avoids a workspace migration before the detector value is proven. +- It lets the detector remain optional and lazily routed only for ambiguous tags. +- It limits refactor scope to a new detector seam instead of changing every current import path. +- It keeps future workspace extraction possible if the detector grows into a real product boundary. + +## Internal WASM Build Approach + +1. Create a Rust crate at `crates/language-detector/`. +2. Export a very small API from Rust that accepts text plus a coarse tag bucket and returns a normalized result object. +3. Build the crate with `wasm-pack --target nodejs` into `generated/wasm-language-detector/`. +4. Add a TypeScript adapter in `src/detector/wasm.ts` that loads the generated wrapper only when detector mode is enabled. +5. Keep the existing default path unchanged; only ambiguous buckets should call the detector adapter. + +## NPM Distribution Model + +- Cross-platform packaging is not the main concern for the current WASM direction. +- For the planned `wasm-pack --target nodejs` flow, the npm package should ship one generated JS wrapper plus one `.wasm` artifact as part of the published package contents. +- Users should install one npm package regardless of OS. 
This is different from native addon distribution, which often requires per-platform binaries or optional platform packages. +- The application should load the packaged local WASM artifact at runtime through the generated Node-target wrapper instead of selecting among OS-specific builds. +- The practical packaging requirement is to ensure the generated runtime files are included in the published package, either by copying them into `dist/` or by explicitly including the generated output path in `package.json`. + +## API Guidance + +- Do not let the WASM spike silently turn `wordCounter()` into an async API. +- Prefer dual entrypoints instead of forcing a single migration path: + - keep the current default library API unchanged + - add explicit detector-enabled entrypoints for both CLI and library usage +- Any detector-enabled library entrypoint must be planned as a package-surface change: + - update the root `exports` map + - decide the ESM entry location + - decide the CJS entry or compatibility strategy + - extend interop tests so the new surface is reachable for supported consumers +- If the generated Node-target WASM wrapper can be loaded synchronously in practice, confirm that in the spike before promising sync library support. + +## When `packages/` Becomes Worth It + +- Move to a workspace only when at least one of these becomes true: + - the CLI and library need independent versioning + - the detector must be published or consumed independently + - Rust/WASM build steps become large enough to justify their own package lifecycle + - test/build/release time is materially cleaner with isolated package boundaries + +## Recommendation + +- Draft a new research document for the packaging/refactor question instead of revising the existing WASM detector spike in place. +- Keep `docs/researches/research-2026-02-18-wasm-language-detector-spike.md` focused on detector feasibility, routing, and candidate engines. 
+- Use this research as the decision record for repository shape, build flow, and API-risk boundaries. + +## Resolution Notes + +- Generated WASM artifacts should not be committed. They should be produced during build and publish flows. +- Because generated WASM artifacts are produced during build and publish, the implementation must explicitly provision Rust and `wasm-pack` anywhere `bun run build` is expected to create publishable outputs. +- Prefer a Node-based build helper such as `scripts/build-wasm.mjs` over adding `shx` by default. +- Add `shx` only if the implementation later needs cross-platform shell-style file operations that are materially simpler than using Node standard library calls. +- The first detector rollout should support both surfaces: + - CLI via detector-specific options + - library via an explicit detector-enabled entrypoint +- Do not plan a workspace extraction now. If package splitting ever becomes necessary later, reevaluate it then instead of treating it as an active design target in this phase. + +## Related Plans + +- `docs/plans/plan-2026-01-02-wc-refactor-locale-research.md` +- `docs/plans/plan-2026-01-01-node-runtime-refactor.md` + +## Related Research + +- `docs/researches/research-2026-02-18-wasm-language-detector-spike.md` +- `docs/researches/research-2026-01-02-language-detection.md` diff --git a/docs/schemas/detector-remap-contract.md b/docs/schemas/detector-remap-contract.md new file mode 100644 index 0000000..ca70b41 --- /dev/null +++ b/docs/schemas/detector-remap-contract.md @@ -0,0 +1,143 @@ +--- +title: "Detector Remap Contract (Draft)" +created-date: 2026-03-23 +modified-date: 2026-03-23 +status: draft +agent: Codex +--- + +# Detector Remap Contract (Draft) + +This document defines how detector-engine output is remapped into the public locale tags exposed by `word-counter`. + +## Scope + +- Applies to detector-enabled flows only. +- Current first engine is `whatlang` behind `--detector wasm`. 
+- Default regex/script detection remains unchanged and is out of scope for this remap contract except where fallback returns to `und-*`. + +## Detector Mode Model + +- `regex` + - default mode + - uses current Unicode script and hint logic only +- `wasm` + - optional detector-assisted mode + - currently planned around `whatlang` + +## Route Gating + +The detector only runs for ambiguous script routes. + +- `und-Latn` + - minimum script-bearing characters: `24` +- `und-Hani` + - minimum script-bearing characters: `12` + +Script-bearing characters mean characters in the relevant script only. +Whitespace, punctuation, symbols, and digits do not count toward the threshold. + +If a chunk is below the threshold, it stays on the original `und-*` route. + +## Confidence and Reliability Policy + +- The detector result must satisfy both: + - confidence threshold for the route + - reliability requirement for the route +- Current draft thresholds: + - `und-Latn`: confidence `>= 0.75` and `reliable = true` + - `und-Hani`: confidence `>= 0.90` and `reliable = true` +- Current Latin corroboration rule: + - if the raw detector sample and the normalized script-bearing detector sample agree on the same remapped Latin tag, accept at confidence `>= 0.70` + - this corroboration path exists to improve noisy markdown-like Latin text without broadly lowering the default Latin threshold + +If the detector result does not satisfy the route policy, the result falls back to the original ambiguous `und-*` tag. + +## `whatlang` Input/Output Normalization + +`whatlang` returns ISO 639-3 language identifiers plus script, confidence, and reliability signals. +Those values are not emitted directly as the public package contract. + +The public output must use this package's own locale-tag contract instead. 
+ +## Route-Specific Remap Rules + +### `und-Latn` + +Allowed remaps: + +| `whatlang` ISO 639-3 | Public tag | +| --- | --- | +| `eng` | `en` | +| `fra` | `fr` | +| `deu` | `de` | +| `spa` | `es` | +| `por` | `pt` | +| `ita` | `it` | +| `nld` | `nl` | +| `pol` | `pl` | +| `tur` | `tr` | +| `ron` | `ro` | +| `hun` | `hu` | +| `ces` | `cs` | +| `dan` | `da` | +| `swe` | `sv` | +| `fin` | `fi` | +| `cat` | `ca` | +| `lat` | `la` | + +Rules: + +- The detector result must report `Latin` script. +- Unsupported languages fall back to `und-Latn`. + +### `und-Hani` + +Allowed remaps: + +| `whatlang` ISO 639-3 | Public tag | +| --- | --- | +| `cmn` | `zh` | +| `jpn` | `ja` | + +Rules: + +- The detector result must report `Mandarin` script from `whatlang`. +- Do not auto-emit `zh-Hans` or `zh-Hant`. +- Unsupported languages fall back to `und-Hani`. + +## Fallback Rules + +Return to the original ambiguous `und-*` tag when any of the following is true: + +- the chunk's script-bearing character count is below the route threshold +- the detector returns no result +- the detector script does not match the route +- the detected language is not in the route allow-list +- confidence is below threshold +- reliability is false + +## Provenance Metadata + +Detector-assisted output should reserve room for provenance metadata in JSON output. + +Draft source values: + +- `script` +- `hint` +- `wasm` + +Defined draft placement: + +- top-level `meta.detector.mode` +- top-level `meta.detector.provenance = "per-item"` +- chunk-style detector-assisted items may include `source` + +This document defines the allowed provenance values and remap behavior. +The broader JSON contract is documented in `docs/schemas/json-output-contract.md`. 
+ +## Related Docs + +- `docs/researches/research-2026-02-18-wasm-language-detector-spike.md` +- `docs/researches/research-2026-03-23-wasm-packaging-repo-structure.md` +- `docs/schemas/json-output-contract.md` diff --git a/docs/schemas/json-output-contract.md b/docs/schemas/json-output-contract.md index a4c8415..c1f723b 100644 --- a/docs/schemas/json-output-contract.md +++ b/docs/schemas/json-output-contract.md @@ -1,7 +1,7 @@ --- title: "JSON Output Contract" created-date: 2026-02-17 -modified-date: 2026-02-17 +modified-date: 2026-03-23 status: completed agent: Codex --- @@ -141,6 +141,56 @@ When skip diagnostics are enabled (debug + not quiet skips), per-file payloads i When non-word collection is enabled, `counts` and non-word breakdown fields are present. Whitespace details appear when whitespace collection is enabled. +### Detector Metadata (`--detector`) + +Detector-aware runs reserve `meta.detector` for detector-related metadata. + +Draft shape: + +```json +{ + "meta": { + "detector": { + "mode": "wasm", + "provenance": "per-item" + } + } +} +``` + +Draft per-item provenance: + +- chunk-style items may include `source` +- allowed source values: + - `script` + - `hint` + - `wasm` + +Example (draft shape): + +```json +{ + "total": 13, + "breakdown": { + "mode": "chunk", + "items": [ + { "locale": "en", "source": "wasm", "words": 13 } + ] + }, + "meta": { + "detector": { + "mode": "wasm", + "provenance": "per-item" + } + } +} +``` + +Notes: + +- Detector provenance is relevant only when detector-aware routes are enabled. +- Aggregated collector-style outputs do not guarantee per-assignment provenance. + ## Contract Rules - `scope` is present only for per-file batch payloads. 
diff --git a/package.json b/package.json index 2082f64..6546d16 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@dev-pi2pie/word-counter", - "version": "0.1.5-canary.1", + "version": "0.1.5-canary.2", "keywords": [ "cli", "intl-segmenter", @@ -28,17 +28,24 @@ "type": "module", "main": "./dist/cjs/index.cjs", "module": "./dist/esm/index.mjs", - "types": "./dist/esm/index.d.ts", + "types": "./dist/esm/index2.d.mts", "exports": { ".": { - "types": "./dist/esm/index.d.ts", + "types": "./dist/esm/index2.d.mts", "import": "./dist/esm/index.mjs", "require": "./dist/cjs/index.cjs" }, + "./detector": { + "types": "./dist/esm/detector.d.mts", + "import": "./dist/esm/detector.mjs", + "require": "./dist/cjs/detector.cjs" + }, "./package.json": "./package.json" }, "scripts": { - "build": "node scripts/generate-embedded-version.mjs && tsdown", + "build": "node scripts/generate-embedded-version.mjs && tsdown && node scripts/build-wasm.mjs", + "build:wasm": "node scripts/build-wasm.mjs", + "verify:package-contents": "node scripts/verify-package-contents.mjs", "type-check": "tsc -p tsconfig.json", "test": "bun test", "test:ci": "bun run build && bun test", @@ -46,14 +53,14 @@ }, "dependencies": { "commander": "^14.0.3", - "yaml": "^2.8.2" + "yaml": "^2.8.3" }, "devDependencies": { "@types/node": "^25.5.0", - "oxfmt": "^0.40.0", - "oxlint": "^1.55.0", + "oxfmt": "^0.41.0", + "oxlint": "^1.56.0", "picocolors": "^1.1.1", - "tsdown": "^0.21.2", + "tsdown": "^0.21.4", "typescript": "^5.9.3" }, "peerDependencies": { diff --git a/scripts/build-wasm.mjs b/scripts/build-wasm.mjs new file mode 100644 index 0000000..4fb14ea --- /dev/null +++ b/scripts/build-wasm.mjs @@ -0,0 +1,78 @@ +import { cp, mkdir, rm } from "node:fs/promises"; +import { dirname, join, resolve } from "node:path"; +import { fileURLToPath } from "node:url"; +import { spawn, spawnSync } from "node:child_process"; + +const scriptDir = dirname(fileURLToPath(import.meta.url)); +const repoRoot = 
resolve(scriptDir, ".."); +const crateDir = join(repoRoot, "crates", "language-detector"); +const generatedDir = join(repoRoot, "generated", "wasm-language-detector"); +const distRuntimeDir = join(repoRoot, "dist", "wasm-language-detector"); + +function runCommand(command, args, cwd) { + return new Promise((resolvePromise, rejectPromise) => { + const child = spawn(command, args, { + cwd, + stdio: "inherit", + env: process.env, + }); + + child.on("error", rejectPromise); + child.on("exit", (code) => { + if (code === 0) { + resolvePromise(undefined); + return; + } + rejectPromise(new Error(`${command} ${args.join(" ")} failed with exit code ${code ?? "unknown"}.`)); + }); + }); +} + +function assertCommandAvailable(command) { + const result = spawnSync(command, ["--version"], { + stdio: "ignore", + env: process.env, + }); + + if (!result.error && result.status === 0) { + return; + } + + throw new Error( + `Missing required command: ${command}. Install the Rust/WASM toolchain before running this build.`, + ); +} + +async function copyRuntimeArtifacts() { + await rm(distRuntimeDir, { recursive: true, force: true }); + await mkdir(join(repoRoot, "dist"), { recursive: true }); + await cp(generatedDir, distRuntimeDir, { recursive: true }); + await rm(join(distRuntimeDir, ".gitignore"), { force: true }); +} + +async function main() { + assertCommandAvailable("cargo"); + assertCommandAvailable("wasm-pack"); + + await rm(generatedDir, { recursive: true, force: true }); + await mkdir(join(repoRoot, "generated"), { recursive: true }); + + await runCommand( + "wasm-pack", + [ + "build", + "--target", + "nodejs", + "--release", + "--out-dir", + "../../generated/wasm-language-detector", + "--out-name", + "language_detector", + ], + crateDir, + ); + + await copyRuntimeArtifacts(); +} + +await main(); diff --git a/scripts/verify-package-contents.mjs b/scripts/verify-package-contents.mjs new file mode 100644 index 0000000..94d5176 --- /dev/null +++ 
b/scripts/verify-package-contents.mjs @@ -0,0 +1,102 @@ +import { spawnSync } from "node:child_process"; +import { mkdtempSync, readFileSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +const requiredFiles = new Set([ + "dist/esm/index.mjs", + "dist/esm/index.d.mts", + "dist/cjs/index.cjs", + "dist/esm/detector.mjs", + "dist/esm/detector.d.mts", + "dist/cjs/detector.cjs", + "dist/wasm-language-detector/language_detector.js", + "dist/wasm-language-detector/language_detector_bg.wasm", +]); + +function normalizePackagePath(value) { + return typeof value === "string" ? value.replace(/^\.\//u, "") : null; +} + +function collectReferencedPackagePaths() { + const packageJsonPath = process.env.npm_package_json ?? join(process.cwd(), "package.json"); + const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf8")); + const referencedPaths = new Set(); + + const maybeAdd = (value) => { + const normalized = normalizePackagePath(value); + if (normalized) { + referencedPaths.add(normalized); + } + }; + + maybeAdd(packageJson.main); + maybeAdd(packageJson.module); + maybeAdd(packageJson.types); + + const visitExports = (value) => { + if (typeof value === "string") { + maybeAdd(value); + return; + } + if (!value || typeof value !== "object") { + return; + } + for (const nested of Object.values(value)) { + visitExports(nested); + } + }; + + visitExports(packageJson.exports); + return referencedPaths; +} + +const tempCacheDir = mkdtempSync(join(tmpdir(), "word-counter-npm-cache-")); + +const result = spawnSync("npm", ["pack", "--dry-run", "--json"], { + cwd: process.cwd(), + encoding: "utf8", + env: { + ...process.env, + npm_config_cache: tempCacheDir, + }, +}); + +if (result.status !== 0) { + process.stderr.write(result.stderr || result.stdout || "npm pack --dry-run failed.\n"); + process.exit(result.status ?? 
1); +} + +let payload; +try { + payload = JSON.parse(result.stdout); +} catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.error(`Failed to parse npm pack --dry-run output: ${message}`); + process.exit(1); +} + +const files = Array.isArray(payload) && payload[0] && Array.isArray(payload[0].files) + ? payload[0].files + : []; + +const presentPaths = new Set( + files + .map((file) => (typeof file?.path === "string" ? file.path : null)) + .filter((path) => typeof path === "string"), +); + +const expectedPaths = new Set([...requiredFiles, ...collectReferencedPackagePaths()]); +const missingFiles = [...expectedPaths].filter((path) => !presentPaths.has(path)); + +if (missingFiles.length > 0) { + console.error("Missing required package contents:"); + for (const file of missingFiles) { + console.error(`- ${file}`); + } + rmSync(tempCacheDir, { recursive: true, force: true }); + process.exit(1); +} + +console.log("Package contents verified."); +rmSync(tempCacheDir, { recursive: true, force: true }); diff --git a/src/cli/batch/jobs/load-count-worker.ts b/src/cli/batch/jobs/load-count-worker.ts index 42dadcb..2e5e34c 100644 --- a/src/cli/batch/jobs/load-count-worker.ts +++ b/src/cli/batch/jobs/load-count-worker.ts @@ -91,6 +91,7 @@ export async function countBatchInputsWithWorkerJobs( filePaths, jobs: options.jobs, section: options.section, + detectorMode: options.detectorMode ?? 
"regex", wcOptions: options.wcOptions, preserveCollectorSegments: options.preserveCollectorSegments, onFileProcessed: options.onFileProcessed, diff --git a/src/cli/batch/jobs/load-count.ts b/src/cli/batch/jobs/load-count.ts index bb24ed9..3f51b67 100644 --- a/src/cli/batch/jobs/load-count.ts +++ b/src/cli/batch/jobs/load-count.ts @@ -1,4 +1,5 @@ import { countSections } from "../../../markdown"; +import { countSectionsWithDetector, wordCounterWithDetector } from "../../../detector"; import wordCounter from "../../../wc"; import { compactCollectorSegmentsInCountResult } from "../aggregate"; import { resolveBatchJobsLimit } from "./limits"; @@ -20,6 +21,7 @@ export async function countBatchInputsWithJobs( filePaths: string[], options: CountBatchWithJobsOptions, ): Promise { + const detectorMode = options.detectorMode ?? "regex"; const limits = resolveBatchJobsLimit(); const total = filePaths.length; let completed = 0; @@ -39,9 +41,19 @@ export async function countBatchInputsWithJobs( } const result = - options.section === "all" - ? wordCounter(loaded.content, options.wcOptions) - : countSections(loaded.content, options.section, options.wcOptions); + detectorMode === "regex" + ? options.section === "all" + ? wordCounter(loaded.content, options.wcOptions) + : countSections(loaded.content, options.section, options.wcOptions) + : options.section === "all" + ? 
await wordCounterWithDetector(loaded.content, { + ...options.wcOptions, + detector: detectorMode, + }) + : await countSectionsWithDetector(loaded.content, options.section, { + ...options.wcOptions, + detector: detectorMode, + }); if (!options.preserveCollectorSegments) { compactCollectorSegmentsInCountResult(result); diff --git a/src/cli/batch/jobs/types.ts b/src/cli/batch/jobs/types.ts index 488c191..9fdc196 100644 --- a/src/cli/batch/jobs/types.ts +++ b/src/cli/batch/jobs/types.ts @@ -1,4 +1,5 @@ import type { SectionMode } from "../../../markdown"; +import type { DetectorMode } from "../../../detector"; import type wordCounter from "../../../wc"; import type { BatchFileResult, BatchSkip } from "../../types"; import type { BatchProgressSnapshot } from "../../progress/reporter"; @@ -15,6 +16,7 @@ export type BatchJobsLimit = { export type CountBatchWithJobsOptions = { jobs: number; section: SectionMode; + detectorMode?: DetectorMode; wcOptions: Parameters[1]; preserveCollectorSegments: boolean; onFileProcessed?: (snapshot: BatchProgressSnapshot) => void; diff --git a/src/cli/batch/jobs/worker-pool.ts b/src/cli/batch/jobs/worker-pool.ts index 45e23b3..6749618 100644 --- a/src/cli/batch/jobs/worker-pool.ts +++ b/src/cli/batch/jobs/worker-pool.ts @@ -2,6 +2,7 @@ import { existsSync } from "node:fs"; import { fileURLToPath } from "node:url"; import { Worker } from "node:worker_threads"; import type { SectionMode } from "../../../markdown"; +import type { DetectorMode } from "../../../detector"; import type wordCounter from "../../../wc"; import type { BatchProgressSnapshot } from "../../progress/reporter"; import type { BatchFileResult, BatchSkip } from "../../types"; @@ -15,6 +16,7 @@ type CountBatchInputsWithWorkerPoolOptions = { filePaths: string[]; jobs: number; section: SectionMode; + detectorMode: DetectorMode; wcOptions: Parameters[1]; preserveCollectorSegments: boolean; onFileProcessed?: (snapshot: BatchProgressSnapshot) => void; @@ -183,6 +185,7 @@ export 
async function countBatchInputsWithWorkerPool( worker = new Worker(workerEntryUrl, { workerData: { section: options.section, + detectorMode: options.detectorMode, wcOptions: options.wcOptions, preserveCollectorSegments: options.preserveCollectorSegments, }, diff --git a/src/cli/batch/jobs/worker/count-worker.ts b/src/cli/batch/jobs/worker/count-worker.ts index 4accf28..4a7bb8c 100644 --- a/src/cli/batch/jobs/worker/count-worker.ts +++ b/src/cli/batch/jobs/worker/count-worker.ts @@ -1,6 +1,7 @@ import { readFile } from "node:fs/promises"; import { parentPort, workerData } from "node:worker_threads"; import { countSections } from "../../../../markdown"; +import { countSectionsWithDetector, wordCounterWithDetector } from "../../../../detector"; import wordCounter from "../../../../wc"; import { compactCollectorSegmentsInCountResult } from "../../aggregate"; import { isProbablyBinary } from "../../../path/load"; @@ -77,9 +78,19 @@ parentPort.on("message", async (message: WorkerRequestMessage) => { try { const content = buffer.toString("utf8"); const result = - config.section === "all" - ? wordCounter(content, config.wcOptions) - : countSections(content, config.section, config.wcOptions); + config.detectorMode === "regex" + ? config.section === "all" + ? wordCounter(content, config.wcOptions) + : countSections(content, config.section, config.wcOptions) + : config.section === "all" + ? 
await wordCounterWithDetector(content, { + ...config.wcOptions, + detector: config.detectorMode, + }) + : await countSectionsWithDetector(content, config.section, { + ...config.wcOptions, + detector: config.detectorMode, + }); if (!config.preserveCollectorSegments) { compactCollectorSegmentsInCountResult(result); diff --git a/src/cli/batch/jobs/worker/protocol.ts b/src/cli/batch/jobs/worker/protocol.ts index 58020d2..9f967bf 100644 --- a/src/cli/batch/jobs/worker/protocol.ts +++ b/src/cli/batch/jobs/worker/protocol.ts @@ -1,9 +1,11 @@ import type { SectionMode, SectionedResult } from "../../../../markdown"; +import type { DetectorMode } from "../../../../detector"; import type { WordCounterOptions, WordCounterResult } from "../../../../wc"; import type { BatchSkip } from "../../../types"; export type WorkerConfig = { section: SectionMode; + detectorMode: DetectorMode; wcOptions: WordCounterOptions; preserveCollectorSegments: boolean; }; diff --git a/src/cli/batch/run.ts b/src/cli/batch/run.ts index c4e62e9..0601d99 100644 --- a/src/cli/batch/run.ts +++ b/src/cli/batch/run.ts @@ -1,4 +1,5 @@ import type { SectionMode } from "../../markdown"; +import type { DetectorWordCounterOptions } from "../../detector"; import { appendAll } from "../../utils/append-all"; import type wordCounter from "../../wc"; import type { DebugChannel } from "../debug/channel"; @@ -19,7 +20,7 @@ type RunBatchCountOptions = { batchOptions: BatchOptions; extensionFilter: DirectoryExtensionFilter; section: SectionMode; - wcOptions: Parameters[1]; + wcOptions: DetectorWordCounterOptions; preserveCollectorSegments: boolean; debug: DebugChannel; progressReporter: BatchProgressReporter; @@ -99,6 +100,7 @@ export async function runBatchCount(options: RunBatchCountOptions): Promise { @@ -129,6 +131,7 @@ export async function runBatchCount(options: RunBatchCountOptions): Promise { @@ -142,6 +145,7 @@ export async function runBatchCount(options: RunBatchCountOptions): Promise { diff --git 
a/src/cli/program/options.ts b/src/cli/program/options.ts index d444adc..624edb1 100644 --- a/src/cli/program/options.ts +++ b/src/cli/program/options.ts @@ -4,9 +4,11 @@ import { collectExtensionOption } from "../path/filter"; import { parseTotalOfOption } from "../total-of"; import type { PathMode } from "../types"; import type { OutputFormat } from "../runtime/types"; +import type { DetectorMode } from "../../detector"; import type { WordCounterMode } from "../../wc"; const MODE_CHOICES: WordCounterMode[] = ["chunk", "segments", "collector", "char", "char-collector"]; +const DETECTOR_CHOICES: DetectorMode[] = ["regex", "wasm"]; const FORMAT_CHOICES: OutputFormat[] = ["standard", "raw", "json"]; const SECTION_CHOICES: SectionMode[] = [ "all", @@ -60,6 +62,11 @@ export function configureProgramOptions( .choices(SECTION_CHOICES) .default("all"), ) + .addOption( + new Option("--detector ", "locale detector mode") + .choices(DETECTOR_CHOICES) + .default("regex"), + ) .addOption( new Option( "--path-mode ", diff --git a/src/cli/program/version-embedded.ts b/src/cli/program/version-embedded.ts index d319787..75cb803 100644 --- a/src/cli/program/version-embedded.ts +++ b/src/cli/program/version-embedded.ts @@ -1,3 +1,3 @@ // This file is generated by scripts/generate-embedded-version.mjs. // Do not edit manually. -export const EMBEDDED_PACKAGE_VERSION = "0.1.5-canary.1"; +export const EMBEDDED_PACKAGE_VERSION = "0.1.5-canary.2"; diff --git a/src/cli/runtime/options.ts b/src/cli/runtime/options.ts index c918b34..a47270d 100644 --- a/src/cli/runtime/options.ts +++ b/src/cli/runtime/options.ts @@ -181,6 +181,7 @@ function resolveLatinHintRules(options: CliActionOptions): LatinHintRule[] | und export function resolveCountRunOptions(options: CliActionOptions): ResolvedCountRunOptions { const useSection = options.section !== "all"; + const detectorMode = options.detector ?? 
"regex"; const totalOfParts = options.totalOf; const requestedNonWords = Boolean(options.nonWords || options.includeWhitespace || options.misc); const collectNonWordsForOverride = requiresNonWordCollection(totalOfParts); @@ -195,10 +196,12 @@ export function resolveCountRunOptions(options: CliActionOptions): ResolvedCount return { useSection, + detectorMode, totalOfParts, requestedNonWords, shouldNormalizeBaseOutput, wcOptions: { + detector: detectorMode, mode: options.mode, latinLanguageHint: options.latinLanguage, latinTagHint: options.latinTag, diff --git a/src/cli/runtime/single.ts b/src/cli/runtime/single.ts index 531f2f5..c5caa84 100644 --- a/src/cli/runtime/single.ts +++ b/src/cli/runtime/single.ts @@ -1,5 +1,9 @@ import type { SectionedResult } from "../../markdown"; import { countSections } from "../../markdown"; +import { + countSectionsWithDetector, + wordCounterWithDetector, +} from "../../detector"; import { getTotalLabels, isSectionedResult, @@ -37,8 +41,18 @@ export async function executeSingleCount({ } const result: WordCounterResult | SectionedResult = resolved.useSection - ? countSections(trimmed, options.section, resolved.wcOptions) - : wordCounter(trimmed, resolved.wcOptions); + ? resolved.detectorMode === "regex" + ? countSections(trimmed, options.section, resolved.wcOptions) + : await countSectionsWithDetector(trimmed, options.section, { + ...resolved.wcOptions, + detector: resolved.detectorMode, + }) + : resolved.detectorMode === "regex" + ? wordCounter(trimmed, resolved.wcOptions) + : await wordCounterWithDetector(trimmed, { + ...resolved.wcOptions, + detector: resolved.detectorMode, + }); const totalOfOverride = resolveTotalOfOverride(result, resolved.totalOfParts); const displayResult = resolved.shouldNormalizeBaseOutput ? 
normalizeResultBase(result) : result; diff --git a/src/cli/runtime/types.ts b/src/cli/runtime/types.ts index db7f649..452ba3c 100644 --- a/src/cli/runtime/types.ts +++ b/src/cli/runtime/types.ts @@ -3,6 +3,7 @@ import type { ProgressOutputStream } from "../progress/reporter"; import type { TotalOfPart } from "../total-of"; import type { PathMode } from "../types"; import type { DoctorRuntimeOverrides } from "../doctor/types"; +import type { DetectorMode, DetectorWordCounterOptions } from "../../detector"; import type { WordCounterMode, WordCounterOptions, WordCounterResult } from "../../wc"; export type OutputFormat = "standard" | "raw" | "json"; @@ -12,6 +13,7 @@ export type CliActionOptions = { format: OutputFormat; pretty: boolean; section: SectionMode; + detector: DetectorMode; latinLanguage?: string; latinTag?: string; latinLocale?: string; @@ -50,10 +52,11 @@ export type RunCliOptions = { export type ResolvedCountRunOptions = { useSection: boolean; + detectorMode: DetectorMode; totalOfParts: TotalOfPart[] | undefined; requestedNonWords: boolean; shouldNormalizeBaseOutput: boolean; - wcOptions: WordCounterOptions; + wcOptions: DetectorWordCounterOptions; }; export type CountResult = WordCounterResult | SectionedResult; diff --git a/src/command.ts b/src/command.ts index 53e99ed..8bdd1a1 100644 --- a/src/command.ts +++ b/src/command.ts @@ -5,6 +5,7 @@ import { configureProgramOptions } from "./cli/program/options"; import { getFormattedVersionLabel } from "./cli/program/version"; import { resolveBatchJobsLimit } from "./cli/batch/jobs/limits"; import { executeBatchCount } from "./cli/runtime/batch"; +import { WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE } from "./detector"; import { hasPathInput, resolveCountRunOptions, @@ -140,6 +141,11 @@ export async function runCli( program.error(pc.red(message)); return; } + if (message === WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE) { + console.error(pc.red(message)); + process.exitCode = 1; + return; + } 
program.error(message); } finally { await debug.close(); @@ -148,6 +154,9 @@ export async function runCli( ); await program.parseAsync(argv); + if (process.exitCode === undefined) { + process.exitCode = 0; + } } export { buildBatchSummary } from "./cli/batch/aggregate"; diff --git a/src/detector/index.cjs.ts b/src/detector/index.cjs.ts new file mode 100644 index 0000000..7d107d1 --- /dev/null +++ b/src/detector/index.cjs.ts @@ -0,0 +1,29 @@ +import { + assertDetectorModeImplemented, + countSectionsWithDetector, + createDetectorResult, + DEFAULT_DETECTOR_MODE, + DEFAULT_DETECTOR_RESULT_SOURCE, + DETECTOR_MODES, + DETECTOR_SOURCES, + resolveDetectorMode, + segmentTextByLocaleWithDetector, + WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE, + wordCounterWithDetector, +} from "./index"; + +const cjsExports = { + assertDetectorModeImplemented, + countSectionsWithDetector, + createDetectorResult, + DEFAULT_DETECTOR_MODE, + DEFAULT_DETECTOR_RESULT_SOURCE, + DETECTOR_MODES, + DETECTOR_SOURCES, + resolveDetectorMode, + segmentTextByLocaleWithDetector, + WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE, + wordCounterWithDetector, +}; + +export = cjsExports; diff --git a/src/detector/index.ts b/src/detector/index.ts new file mode 100644 index 0000000..70e2e9b --- /dev/null +++ b/src/detector/index.ts @@ -0,0 +1,96 @@ +import type { SectionMode } from "../markdown"; +import type { LocaleChunk } from "../wc/types"; +import { + countSectionsWithRegexDetector, + segmentTextByLocaleWithRegexDetector, + wordCounterWithRegexDetector, +} from "./none"; +import { + countSectionsWithWasmDetector, + segmentTextByLocaleWithWasmDetector, + WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE, + wordCounterWithWasmDetector, +} from "./wasm"; +import type { + DetectorCountSectionsOptions, + DetectorLocaleOptions, + DetectorMode, + DetectorResult, + DetectorSource, + DetectorWordCounterOptions, +} from "./types"; + +export type { + DetectorCountSections, + DetectorCountSectionsOptions, + DetectorCountResult, + 
DetectorLocaleOptions, + DetectorMode, + DetectorResult, + DetectorRuntimeOptions, + DetectorSource, + DetectorWordCounterOptions, +} from "./types"; + +export const DETECTOR_MODES: DetectorMode[] = ["regex", "wasm"]; +export const DEFAULT_DETECTOR_MODE: DetectorMode = "regex"; + +export function resolveDetectorMode(mode?: DetectorMode): DetectorMode { + return mode ?? DEFAULT_DETECTOR_MODE; +} + +export function assertDetectorModeImplemented(mode?: DetectorMode): void { + void mode; +} + +export async function segmentTextByLocaleWithDetector( + text: string, + options: DetectorLocaleOptions = {}, +): Promise { + const mode = resolveDetectorMode(options.detector); + if (mode === "wasm") { + return segmentTextByLocaleWithWasmDetector(text, options); + } + return segmentTextByLocaleWithRegexDetector(text, options); +} + +export async function wordCounterWithDetector( + text: string, + options: DetectorWordCounterOptions = {}, +) { + const mode = resolveDetectorMode(options.detector); + if (mode === "wasm") { + return wordCounterWithWasmDetector(text, options); + } + return wordCounterWithRegexDetector(text, options); +} + +export async function countSectionsWithDetector( + input: string, + section: SectionMode, + options: DetectorCountSectionsOptions = {}, +) { + const mode = resolveDetectorMode(options.detector); + if (mode === "wasm") { + return countSectionsWithWasmDetector(input, section, options); + } + return countSectionsWithRegexDetector(input, section, options); +} + +export const DETECTOR_SOURCES: DetectorSource[] = ["script", "hint", "wasm"]; +export const DEFAULT_DETECTOR_RESULT_SOURCE: DetectorSource = "script"; +export { WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE }; + +export function createDetectorResult( + tag: string, + source: DetectorSource = DEFAULT_DETECTOR_RESULT_SOURCE, + confidence?: number, + reliable?: boolean, +): DetectorResult { + return { + tag, + source, + ...(confidence === undefined ? 
{} : { confidence }), + ...(reliable === undefined ? {} : { reliable }), + }; +} diff --git a/src/detector/none.ts b/src/detector/none.ts new file mode 100644 index 0000000..9c606f9 --- /dev/null +++ b/src/detector/none.ts @@ -0,0 +1,29 @@ +import { countSections } from "../markdown"; +import wordCounter, { segmentTextByLocale } from "../wc"; +import type { + DetectorCountSectionsOptions, + DetectorLocaleOptions, + DetectorWordCounterOptions, +} from "./types"; + +export async function segmentTextByLocaleWithRegexDetector( + text: string, + options: DetectorLocaleOptions = {}, +) { + return segmentTextByLocale(text, options); +} + +export async function wordCounterWithRegexDetector( + text: string, + options: DetectorWordCounterOptions = {}, +) { + return wordCounter(text, options); +} + +export async function countSectionsWithRegexDetector( + input: string, + section: Parameters[1], + options: DetectorCountSectionsOptions = {}, +) { + return countSections(input, section, options); +} diff --git a/src/detector/policy.ts b/src/detector/policy.ts new file mode 100644 index 0000000..6eaba4d --- /dev/null +++ b/src/detector/policy.ts @@ -0,0 +1,77 @@ +import { DEFAULT_HAN_TAG, DEFAULT_LOCALE } from "../wc/locale-detect"; + +export const LATIN_WASM_MIN_SCRIPT_CHARS = 24; +export const HANI_WASM_MIN_SCRIPT_CHARS = 12; +export const LATIN_WASM_MIN_CONFIDENCE = 0.75; +export const HANI_WASM_MIN_CONFIDENCE = 0.9; +export const LATIN_WASM_CORROBORATED_MIN_CONFIDENCE = 0.7; + +const LATIN_SCRIPT_REGEX = /\p{Script=Latin}/u; +const HAN_SCRIPT_REGEX = /\p{Script=Han}/u; + +export type DetectorRouteTag = typeof DEFAULT_LOCALE | typeof DEFAULT_HAN_TAG; + +export type DetectorRoutePolicy = { + routeTag: DetectorRouteTag; + minScriptChars: number; + minConfidence: number; + requireReliable: boolean; +}; + +export const DETECTOR_ROUTE_POLICIES: Record = { + [DEFAULT_LOCALE]: { + routeTag: DEFAULT_LOCALE, + minScriptChars: LATIN_WASM_MIN_SCRIPT_CHARS, + minConfidence: 
LATIN_WASM_MIN_CONFIDENCE, + requireReliable: true, + }, + [DEFAULT_HAN_TAG]: { + routeTag: DEFAULT_HAN_TAG, + minScriptChars: HANI_WASM_MIN_SCRIPT_CHARS, + minConfidence: HANI_WASM_MIN_CONFIDENCE, + requireReliable: true, + }, +}; + +export function isAmbiguousDetectorRoute(locale: string): locale is DetectorRouteTag { + return locale === DEFAULT_LOCALE || locale === DEFAULT_HAN_TAG; +} + +export function countScriptBearingCharsForRoute( + text: string, + routeTag: DetectorRouteTag, +): number { + const matcher = routeTag === DEFAULT_HAN_TAG ? HAN_SCRIPT_REGEX : LATIN_SCRIPT_REGEX; + let count = 0; + for (const char of text) { + if (matcher.test(char)) { + count += 1; + } + } + return count; +} + +export function shouldRunWasmDetector(text: string, routeTag: DetectorRouteTag): boolean { + const policy = DETECTOR_ROUTE_POLICIES[routeTag]; + return countScriptBearingCharsForRoute(text, routeTag) >= policy.minScriptChars; +} + +export function normalizeDetectorSampleForRoute( + text: string, + routeTag: DetectorRouteTag, +): string { + const matcher = routeTag === DEFAULT_HAN_TAG ? 
HAN_SCRIPT_REGEX : LATIN_SCRIPT_REGEX; + return [...text] + .map((char) => { + if (matcher.test(char)) { + return char; + } + if (/\s/u.test(char)) { + return " "; + } + return " "; + }) + .join("") + .replace(/\s+/g, " ") + .trim(); +} diff --git a/src/detector/result-builder.ts b/src/detector/result-builder.ts new file mode 100644 index 0000000..d1c26e7 --- /dev/null +++ b/src/detector/result-builder.ts @@ -0,0 +1,170 @@ +import { + analyzeCharChunk, + analyzeChunk, + aggregateByLocale, + aggregateCharsByLocale, +} from "../wc/analyze"; +import { resolveMode } from "../wc/mode"; +import { createNonWordCollection, mergeNonWordCollections } from "../wc/non-words"; +import type { + CharBreakdown, + CharCollectorBreakdown, + ChunkBreakdown, + ChunkWithSegments, + LocaleChunk, + NonWordCollection, + WordCounterMode, + WordCounterOptions, + WordCounterResult, +} from "../wc/types"; + +function getNonWordTotal(nonWords: NonWordCollection): number { + return ( + nonWords.counts.emoji + + nonWords.counts.symbols + + nonWords.counts.punctuation + + (nonWords.counts.whitespace ?? 
0) + ); +} + +function collectNonWordsAggregate( + analyzed: Array<{ nonWords?: NonWordCollection }>, + enabled: boolean, +): NonWordCollection | undefined { + if (!enabled) { + return undefined; + } + const collection = createNonWordCollection(); + for (const chunk of analyzed) { + if (!chunk.nonWords) { + continue; + } + mergeNonWordCollections(collection, chunk.nonWords); + } + return collection; +} + +export function buildWordCounterResultFromChunks( + chunks: LocaleChunk[], + options: WordCounterOptions = {}, +): WordCounterResult { + const mode: WordCounterMode = resolveMode(options.mode, "chunk"); + const collectNonWords = Boolean(options.nonWords); + const includeWhitespace = Boolean(options.includeWhitespace); + + if (mode === "char" || mode === "char-collector") { + const analyzed = chunks.map((chunk) => + analyzeCharChunk(chunk, collectNonWords, includeWhitespace), + ); + const total = analyzed.reduce((sum, chunk) => sum + chunk.chars, 0); + const counts = collectNonWords + ? { + words: analyzed.reduce((sum, chunk) => sum + chunk.wordChars, 0), + nonWords: analyzed.reduce((sum, chunk) => sum + chunk.nonWordChars, 0), + total, + } + : undefined; + + if (mode === "char") { + const items: CharBreakdown[] = analyzed.map((chunk) => ({ + locale: chunk.locale, + text: chunk.text, + chars: chunk.chars, + nonWords: chunk.nonWords, + })); + return { + total, + counts, + breakdown: { + mode, + items, + }, + }; + } + + const aggregated = aggregateCharsByLocale(analyzed); + const items: CharCollectorBreakdown[] = aggregated.map((chunk) => ({ + locale: chunk.locale, + chars: chunk.chars, + nonWords: chunk.nonWords, + })); + return { + total, + counts, + breakdown: { + mode, + items, + }, + }; + } + + const analyzed = chunks.map((chunk) => + analyzeChunk(chunk, collectNonWords, includeWhitespace), + ); + const wordsTotal = analyzed.reduce((sum, chunk) => sum + chunk.words, 0); + const nonWordsTotal = collectNonWords + ? 
analyzed.reduce((sum, chunk) => { + if (!chunk.nonWords) { + return sum; + } + return sum + getNonWordTotal(chunk.nonWords); + }, 0) + : 0; + const total = analyzed.reduce((sum, chunk) => { + let chunkTotal = chunk.words; + if (collectNonWords && chunk.nonWords) { + chunkTotal += getNonWordTotal(chunk.nonWords); + } + return sum + chunkTotal; + }, 0); + + const counts = collectNonWords ? { words: wordsTotal, nonWords: nonWordsTotal, total } : undefined; + + if (mode === "segments") { + const items: ChunkWithSegments[] = analyzed.map((chunk) => ({ + locale: chunk.locale, + text: chunk.text, + words: chunk.words, + segments: chunk.segments, + nonWords: chunk.nonWords, + })); + return { + total, + counts, + breakdown: { + mode, + items, + }, + }; + } + + if (mode === "collector") { + const items = aggregateByLocale(analyzed); + const nonWords = collectNonWordsAggregate(analyzed, collectNonWords); + return { + total, + counts, + breakdown: { + mode, + items, + nonWords, + }, + }; + } + + const items: ChunkBreakdown[] = analyzed.map((chunk) => ({ + locale: chunk.locale, + text: chunk.text, + words: chunk.words, + nonWords: chunk.nonWords, + })); + + return { + total, + counts, + breakdown: { + mode, + items, + }, + }; +} diff --git a/src/detector/sections.ts b/src/detector/sections.ts new file mode 100644 index 0000000..cada80c --- /dev/null +++ b/src/detector/sections.ts @@ -0,0 +1,104 @@ +import { parseMarkdown } from "../markdown"; +import type { SectionMode, SectionedResult } from "../markdown"; +import type { WordCounterMode, WordCounterResult } from "../wc/types"; +import type { DetectorCountSectionsOptions } from "./types"; +import { wordCounterWithDetector } from "./index"; + +function normalizeText(value: unknown): string { + if (value == null) { + return ""; + } + if (typeof value === "string") { + return value; + } + if (typeof value === "number" || typeof value === "boolean") { + return String(value); + } + try { + return JSON.stringify(value); + } catch { + 
return String(value); + } +} + +async function buildPerKeyItems( + data: Record | null, + options: DetectorCountSectionsOptions, +): Promise> { + if (!data || typeof data !== "object" || Array.isArray(data)) { + return []; + } + + return Promise.all( + Object.entries(data).map(async ([key, value]) => { + const valueText = normalizeText(value); + const text = valueText ? `${key}: ${valueText}` : key; + return { + name: key, + source: "frontmatter" as const, + result: await wordCounterWithDetector(text, options), + }; + }), + ); +} + +async function buildSingleItem( + name: string, + text: string, + options: DetectorCountSectionsOptions, + source: "frontmatter" | "content", +): Promise> { + return [{ name, source, result: await wordCounterWithDetector(text, options) }]; +} + +function sumTotals(items: Array<{ result: WordCounterResult }>): number { + return items.reduce((sum, item) => sum + item.result.total, 0); +} + +export async function countSectionsWithResolvedDetector( + input: string, + section: SectionMode, + options: DetectorCountSectionsOptions = {}, +): Promise { + const mode: WordCounterMode = options.mode ?? "chunk"; + if (section === "all") { + const result = await wordCounterWithDetector(input, options); + return { + section, + total: result.total, + frontmatterType: null, + items: [{ name: "all", source: "content", result }], + }; + } + + const parsed = parseMarkdown(input); + const frontmatterText = parsed.frontmatter ?? ""; + const contentText = parsed.content ?? 
""; + + let items: Array<{ name: string; source: "frontmatter" | "content"; result: WordCounterResult }> = []; + + if (section === "frontmatter") { + items = await buildSingleItem("frontmatter", frontmatterText, options, "frontmatter"); + } else if (section === "content") { + items = await buildSingleItem("content", contentText, options, "content"); + } else if (section === "split") { + items = [ + ...(await buildSingleItem("frontmatter", frontmatterText, options, "frontmatter")), + ...(await buildSingleItem("content", contentText, options, "content")), + ]; + } else if (section === "per-key") { + items = await buildPerKeyItems(parsed.data, options); + } else if (section === "split-per-key") { + items = [ + ...(await buildPerKeyItems(parsed.data, options)), + ...(await buildSingleItem("content", contentText, options, "content")), + ]; + } + + return { + section, + total: sumTotals(items), + frontmatterType: parsed.frontmatterType, + items, + }; +} diff --git a/src/detector/types.ts b/src/detector/types.ts new file mode 100644 index 0000000..da76581 --- /dev/null +++ b/src/detector/types.ts @@ -0,0 +1,34 @@ +import type { SectionedResult, SectionMode } from "../markdown"; +import type { LocaleDetectOptions } from "../wc/locale-detect"; +import type { WordCounterOptions, WordCounterResult } from "../wc/types"; + +export type DetectorMode = "regex" | "wasm"; + +export type DetectorSource = "script" | "hint" | "wasm"; + +export interface DetectorResult { + tag: string; + confidence?: number; + reliable?: boolean; + source: DetectorSource; +} + +export interface DetectorRuntimeOptions { + detector?: DetectorMode; +} + +export interface DetectorLocaleOptions extends LocaleDetectOptions, DetectorRuntimeOptions {} + +export interface DetectorWordCounterOptions + extends WordCounterOptions, + DetectorRuntimeOptions {} + +export type DetectorCountSectionsOptions = DetectorWordCounterOptions; + +export type DetectorCountResult = WordCounterResult | SectionedResult; + +export 
type DetectorCountSections = ( + input: string, + section: SectionMode, + options?: DetectorCountSectionsOptions, +) => Promise; diff --git a/src/detector/wasm.ts b/src/detector/wasm.ts new file mode 100644 index 0000000..1574b99 --- /dev/null +++ b/src/detector/wasm.ts @@ -0,0 +1,176 @@ +import { DEFAULT_HAN_TAG, DEFAULT_LOCALE } from "../wc/locale-detect"; +import { segmentTextByLocale } from "../wc"; +import type { LocaleChunk } from "../wc/types"; +import { buildWordCounterResultFromChunks } from "./result-builder"; +import { countSectionsWithResolvedDetector } from "./sections"; +import { + DETECTOR_ROUTE_POLICIES, + LATIN_WASM_CORROBORATED_MIN_CONFIDENCE, + isAmbiguousDetectorRoute, + normalizeDetectorSampleForRoute, + shouldRunWasmDetector, + type DetectorRouteTag, +} from "./policy"; +import { detectWithWhatlangWasm, WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE } from "./whatlang-wasm"; +import { getDetectorFallbackTag, remapWhatlangResult } from "./whatlang-map"; +import type { + DetectorCountSectionsOptions, + DetectorLocaleOptions, + DetectorWordCounterOptions, +} from "./types"; + +function shouldAcceptDetectorTag( + routeTag: DetectorRouteTag, + confidence: number | undefined, + reliable: boolean | undefined, +): boolean { + const policy = DETECTOR_ROUTE_POLICIES[routeTag]; + if (policy.requireReliable && reliable !== true) { + return false; + } + + if (confidence === undefined) { + return false; + } + + return confidence >= policy.minConfidence; +} + +type DetectorWindow = { + routeTag: DetectorRouteTag; + startIndex: number; + endIndex: number; + text: string; +}; + +function buildDetectorWindows(chunks: LocaleChunk[]): DetectorWindow[] { + const windows: DetectorWindow[] = []; + + for (let index = 0; index < chunks.length; index += 1) { + const chunk = chunks[index]; + if (!chunk || !isAmbiguousDetectorRoute(chunk.locale)) { + continue; + } + + const previousWindow = windows[windows.length - 1]; + if ( + previousWindow && + previousWindow.routeTag === 
chunk.locale && + previousWindow.endIndex === index - 1 + ) { + previousWindow.endIndex = index; + previousWindow.text += chunk.text; + continue; + } + + windows.push({ + routeTag: chunk.locale, + startIndex: index, + endIndex: index, + text: chunk.text, + }); + } + + return windows; +} + +async function resolveWindowLocale(window: DetectorWindow): Promise { + if (!shouldRunWasmDetector(window.text, window.routeTag)) { + return window.routeTag; + } + + const rawResult = await detectWithWhatlangWasm(window.text, window.routeTag); + const rawRemapped = rawResult ? remapWhatlangResult(rawResult, window.routeTag) : null; + + const normalizedSample = normalizeDetectorSampleForRoute(window.text, window.routeTag); + const normalizedResult = + normalizedSample.length > 0 && normalizedSample !== window.text + ? await detectWithWhatlangWasm(normalizedSample, window.routeTag) + : null; + const normalizedRemapped = normalizedResult + ? remapWhatlangResult(normalizedResult, window.routeTag) + : null; + + const candidates = [rawRemapped, normalizedRemapped].filter((value) => value !== null); + if (candidates.length === 0) { + return getDetectorFallbackTag(window.routeTag); + } + + const strongestCandidate = candidates.reduce((best, current) => { + if (!best) { + return current; + } + return (current.confidence ?? 0) > (best.confidence ?? 0) ? current : best; + }, candidates[0]); + + if ( + strongestCandidate && + shouldAcceptDetectorTag( + window.routeTag, + strongestCandidate.confidence, + strongestCandidate.reliable, + ) + ) { + return strongestCandidate.tag; + } + + if ( + window.routeTag === DEFAULT_LOCALE && + rawRemapped && + normalizedRemapped && + rawRemapped.tag === normalizedRemapped.tag + ) { + const corroboratedConfidence = Math.max( + rawRemapped.confidence ?? 0, + normalizedRemapped.confidence ?? 
0, + ); + if (corroboratedConfidence >= LATIN_WASM_CORROBORATED_MIN_CONFIDENCE) { + return rawRemapped.tag; + } + } + + return getDetectorFallbackTag(window.routeTag); +} + +export { WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE }; + +export async function segmentTextByLocaleWithWasmDetector( + text: string, + options: DetectorLocaleOptions = {}, +) { + const chunks = segmentTextByLocale(text, options); + const resolved = [...chunks]; + const windows = buildDetectorWindows(chunks); + + for (const window of windows) { + const resolvedLocale = await resolveWindowLocale(window); + for (let index = window.startIndex; index <= window.endIndex; index += 1) { + const chunk = resolved[index]; + if (!chunk) { + continue; + } + resolved[index] = { + ...chunk, + locale: resolvedLocale, + }; + } + } + + return resolved; +} + +export async function wordCounterWithWasmDetector( + text: string, + options: DetectorWordCounterOptions = {}, +) { + const chunks = await segmentTextByLocaleWithWasmDetector(text, options); + return buildWordCounterResultFromChunks(chunks, options); +} + +export async function countSectionsWithWasmDetector( + input: string, + section: Parameters[1], + options: DetectorCountSectionsOptions = {}, +) { + return countSectionsWithResolvedDetector(input, section, options); +} diff --git a/src/detector/whatlang-map.ts b/src/detector/whatlang-map.ts new file mode 100644 index 0000000..5f774da --- /dev/null +++ b/src/detector/whatlang-map.ts @@ -0,0 +1,79 @@ +import { DEFAULT_HAN_TAG, DEFAULT_LOCALE } from "../wc/locale-detect"; +import type { DetectorRouteTag } from "./policy"; +import type { DetectorResult } from "./types"; + +export interface WhatlangWasmResult { + lang: string; + script: string; + confidence: number; + reliable: boolean; +} + +const LATIN_LANGUAGE_TAGS: Record = { + cat: "ca", + ces: "cs", + dan: "da", + deu: "de", + eng: "en", + fin: "fi", + fra: "fr", + hun: "hu", + ita: "it", + lat: "la", + nld: "nl", + pol: "pl", + por: "pt", + ron: "ro", + 
spa: "es", + swe: "sv", + tur: "tr", +}; + +const HANI_LANGUAGE_TAGS: Record = { + cmn: "zh", + jpn: "ja", +}; + +function hasSupportedScript(result: WhatlangWasmResult, routeTag: DetectorRouteTag): boolean { + if (routeTag === DEFAULT_LOCALE) { + return result.script === "Latin"; + } + + return result.script === "Mandarin"; +} + +function remapLanguageTag( + lang: string, + routeTag: DetectorRouteTag, +): string | undefined { + if (routeTag === DEFAULT_LOCALE) { + return LATIN_LANGUAGE_TAGS[lang]; + } + + return HANI_LANGUAGE_TAGS[lang]; +} + +export function remapWhatlangResult( + result: WhatlangWasmResult, + routeTag: DetectorRouteTag, +): DetectorResult | null { + if (!hasSupportedScript(result, routeTag)) { + return null; + } + + const tag = remapLanguageTag(result.lang, routeTag); + if (!tag) { + return null; + } + + return { + tag, + confidence: result.confidence, + reliable: result.reliable, + source: "wasm", + }; +} + +export function getDetectorFallbackTag(routeTag: DetectorRouteTag): string { + return routeTag === DEFAULT_HAN_TAG ? DEFAULT_HAN_TAG : DEFAULT_LOCALE; +} diff --git a/src/detector/whatlang-wasm.ts b/src/detector/whatlang-wasm.ts new file mode 100644 index 0000000..e0fa130 --- /dev/null +++ b/src/detector/whatlang-wasm.ts @@ -0,0 +1,68 @@ +import { existsSync } from "node:fs"; +import { dirname, join } from "node:path"; +import { createRequire } from "node:module"; +import { fileURLToPath } from "node:url"; +import type { DetectorRouteTag } from "./policy"; +import type { WhatlangWasmResult } from "./whatlang-map"; + +const GENERATED_FOLDER_NAME = "wasm-language-detector"; +const GENERATED_MODULE_FILE = "language_detector.js"; +const MAX_SEARCH_DEPTH = 8; +const requireFromHere = createRequire(import.meta.url); + +export const WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE = + "WASM detector runtime is unavailable. 
Run `bun run build:wasm` to generate it."; + +type WhatlangWasmModule = { + detect_language: (text: string, routeTag: string) => WhatlangWasmResult | null; +}; + +let modulePromise: Promise | null = null; + +function resolveCandidateModulePaths(): string[] { + const moduleDir = dirname(fileURLToPath(import.meta.url)); + const candidates = new Set(); + let currentDir = moduleDir; + + for (let depth = 0; depth < MAX_SEARCH_DEPTH; depth += 1) { + candidates.add(join(currentDir, GENERATED_FOLDER_NAME, GENERATED_MODULE_FILE)); + candidates.add(join(currentDir, "generated", GENERATED_FOLDER_NAME, GENERATED_MODULE_FILE)); + + const parentDir = dirname(currentDir); + if (parentDir === currentDir) { + break; + } + currentDir = parentDir; + } + + return [...candidates]; +} + +function resolveWhatlangWasmModulePath(): string { + for (const candidate of resolveCandidateModulePaths()) { + if (existsSync(candidate)) { + return candidate; + } + } + + throw new Error(WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE); +} + +async function loadWhatlangWasmModule(): Promise { + if (!modulePromise) { + modulePromise = (async () => { + const modulePath = resolveWhatlangWasmModulePath(); + return requireFromHere(modulePath) as WhatlangWasmModule; + })(); + } + + return modulePromise; +} + +export async function detectWithWhatlangWasm( + text: string, + routeTag: DetectorRouteTag, +): Promise { + const wasmModule = await loadWhatlangWasmModule(); + return wasmModule.detect_language(text, routeTag); +} diff --git a/test/cjs-interop.test.ts b/test/cjs-interop.test.ts index ef4a427..87e44a9 100644 --- a/test/cjs-interop.test.ts +++ b/test/cjs-interop.test.ts @@ -3,6 +3,7 @@ import { createRequire } from "node:module"; const require = createRequire(import.meta.url); const cjs = require("../dist/cjs/index.cjs"); +const detectorCjs = require("../dist/cjs/detector.cjs"); describe("CJS wrapper interop", () => { test("require() returns default function with named properties", () => { @@ -18,4 +19,10 @@ 
describe("CJS wrapper interop", () => { const result = cjs("Hello world"); expect(result.total).toBe(2); }); + + test("detector wrapper exports the runtime unavailable message", () => { + expect(detectorCjs.WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE).toBe( + "WASM detector runtime is unavailable. Run `bun run build:wasm` to generate it.", + ); + }); }); diff --git a/test/command.test.ts b/test/command.test.ts index ad3a3bb..2b058af 100644 --- a/test/command.test.ts +++ b/test/command.test.ts @@ -1,4 +1,5 @@ import { afterEach, describe, expect, test } from "bun:test"; +import { spawnSync } from "node:child_process"; import { mkdtemp, mkdir, readFile, readdir, rm, writeFile } from "node:fs/promises"; import { tmpdir } from "node:os"; import { basename, join } from "node:path"; @@ -18,6 +19,7 @@ import { validateStandalonePrintJobsLimitUsage, } from "../src/cli/runtime/options"; import { TOTAL_OF_PARTS } from "../src/cli/total-of"; +import { hasWasmDetectorRuntime } from "./support/wasm-detector-runtime"; const tempRoots: string[] = []; @@ -211,6 +213,54 @@ describe("batch path resolution", () => { }); }); +describe("detector mode", () => { + test("keeps regex as the default detector mode", async () => { + const output = await captureCli(["--format", "json", "Hello world"]); + + expect(output.exitCode).toBe(0); + expect(JSON.parse(output.stdout[0] ?? "{}")).toMatchObject({ total: 2 }); + }); + + test("accepts explicit regex detector mode", async () => { + const output = await captureCli(["--detector", "regex", "--format", "json", "Hello world"]); + + expect(output.exitCode).toBe(0); + expect(JSON.parse(output.stdout[0] ?? 
"{}")).toMatchObject({ total: 2 }); + }); + + test("supports wasm detector mode for long ambiguous Latin text", async () => { + if (!hasWasmDetectorRuntime()) { + return; + } + + const output = await captureCli([ + "--detector", + "wasm", + "--format", + "json", + "This sentence should clearly be detected as English for the wasm detector path.", + ]); + + expect(output.exitCode).toBe(0); + const parsed = JSON.parse(output.stdout[0] ?? "{}"); + expect(parsed.breakdown.items[0]?.locale).toBe("en"); + }); + + test("rejects invalid detector mode values", () => { + const result = spawnSync( + process.execPath, + ["run", "src/bin.ts", "--detector", "invalid", "Hello world"], + { + cwd: process.cwd(), + encoding: "utf8", + }, + ); + + expect(result.status).toBe(1); + expect(result.stderr).toContain("option '--detector ' argument 'invalid' is invalid"); + }); +}); + describe("batch aggregation", () => { test("keeps merged breakdown order deterministic across files", async () => { const root = await makeTempFixture("batch-aggregate-order"); diff --git a/test/detector-interop.test.ts b/test/detector-interop.test.ts new file mode 100644 index 0000000..5c71bc2 --- /dev/null +++ b/test/detector-interop.test.ts @@ -0,0 +1,12 @@ +import { describe, expect, test } from "bun:test"; + +describe("detector subpath interop", () => { + test("ESM detector entry is reachable", async () => { + const detector = await import("../src/detector/index.ts"); + + expect(detector.DEFAULT_DETECTOR_MODE).toBe("regex"); + await expect( + detector.wordCounterWithDetector("Hello world", { detector: "regex" }), + ).resolves.toMatchObject({ total: 2 }); + }); +}); diff --git a/test/package-types.test.ts b/test/package-types.test.ts new file mode 100644 index 0000000..e0a6823 --- /dev/null +++ b/test/package-types.test.ts @@ -0,0 +1,65 @@ +import { afterEach, describe, expect, test } from "bun:test"; +import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises"; +import { createRequire } from 
"node:module"; +import { join } from "node:path"; +import { spawnSync } from "node:child_process"; + +const tempRoots: string[] = []; +const require = createRequire(import.meta.url); +const tscEntrypoint = require.resolve("typescript/bin/tsc"); + +afterEach(async () => { + await Promise.all( + tempRoots.splice(0).map((path) => rm(path, { recursive: true, force: true })), + ); +}); + +async function makeTypecheckFixture(): Promise { + const generatedRoot = join(process.cwd(), "generated"); + await mkdir(generatedRoot, { recursive: true }); + const root = await mkdtemp(join(generatedRoot, "typecheck-")); + tempRoots.push(root); + return root; +} + +describe("published package types", () => { + test("root package supports default and named imports", async () => { + const fixtureRoot = await makeTypecheckFixture(); + const entryPath = join(fixtureRoot, "root-imports.mts"); + + await writeFile( + entryPath, + [ + "import wc, { wordCounter, countSections } from '@dev-pi2pie/word-counter';", + "wc('Hello world');", + "wordCounter('Hello world');", + "countSections('Hello world', 'all');", + ].join("\n"), + ); + + const result = spawnSync( + process.execPath, + [ + tscEntrypoint, + "--noEmit", + "--pretty", + "false", + "--module", + "NodeNext", + "--moduleResolution", + "NodeNext", + "--target", + "ES2022", + "--skipLibCheck", + entryPath, + ], + { + cwd: process.cwd(), + encoding: "utf8", + }, + ); + + expect(result.status).toBe(0); + expect(result.stderr).toBe(""); + }); +}); diff --git a/test/support/wasm-detector-runtime.ts b/test/support/wasm-detector-runtime.ts new file mode 100644 index 0000000..6585c3c --- /dev/null +++ b/test/support/wasm-detector-runtime.ts @@ -0,0 +1,14 @@ +import { existsSync } from "node:fs"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +const testDir = dirname(fileURLToPath(import.meta.url)); +const repoRoot = join(testDir, "..", ".."); +const runtimeCandidates = [ + join(repoRoot, "generated", 
"wasm-language-detector", "language_detector.js"), + join(repoRoot, "dist", "wasm-language-detector", "language_detector.js"), +]; + +export function hasWasmDetectorRuntime(): boolean { + return runtimeCandidates.some((candidate) => existsSync(candidate)); +} diff --git a/test/word-counter.test.ts b/test/word-counter.test.ts index d2d3d4a..e552662 100644 --- a/test/word-counter.test.ts +++ b/test/word-counter.test.ts @@ -5,6 +5,12 @@ import wordCounter, { countWordsForLocale, segmentTextByLocale, } from "../src/wc"; +import { + countSectionsWithDetector, + segmentTextByLocaleWithDetector, + wordCounterWithDetector, +} from "../src/detector"; +import { hasWasmDetectorRuntime } from "./support/wasm-detector-runtime"; describe("wordCounter", () => { test("counts Latin words in chunk mode by default", () => { @@ -80,6 +86,82 @@ describe("wordCounter", () => { }); }); +describe("detector entrypoint", () => { + test("uses regex detector mode by default", async () => { + const result = await wordCounterWithDetector("Hello world"); + + expect(result.total).toBe(2); + }); + + test("supports explicit regex detector mode", async () => { + const result = await wordCounterWithDetector("Hello world", { detector: "regex" }); + + expect(result.total).toBe(2); + }); + + test("keeps short ambiguous Latin chunks on und-Latn in wasm mode", async () => { + const result = await wordCounterWithDetector("Hello world", { detector: "wasm" }); + + expect(result.breakdown.mode).toBe("chunk"); + expect(result.breakdown.items[0]?.locale).toBe("und-Latn"); + }); + + test("promotes long ambiguous Latin chunks in wasm mode", async () => { + if (!hasWasmDetectorRuntime()) { + return; + } + + const result = await wordCounterWithDetector( + "This sentence should clearly be detected as English for the wasm detector path.", + { detector: "wasm" }, + ); + + expect(result.breakdown.mode).toBe("chunk"); + expect(result.breakdown.items[0]?.locale).toBe("en"); + }); + + test("promotes corroborated 
markdown-like Latin text in wasm mode", async () => { + if (!hasWasmDetectorRuntime()) { + return; + } + + const result = await wordCounterWithDetector( + ["---", "title: Alpha Story", "summary: Intro note", "---", "Hello world from alpha."].join( + "\n", + ), + { detector: "wasm" }, + ); + + expect(result.breakdown.mode).toBe("chunk"); + expect(result.breakdown.items[0]?.locale).toBe("en"); + }); + + test("keeps low-confidence short English-like text on und-Latn in wasm mode", async () => { + if (!hasWasmDetectorRuntime()) { + return; + } + + const result = await wordCounterWithDetector("Plain text file for batch counting.", { + detector: "wasm", + }); + + expect(result.breakdown.mode).toBe("chunk"); + expect(result.breakdown.items[0]?.locale).toBe("und-Latn"); + }); + + test("segments text through detector entrypoint", async () => { + const chunks = await segmentTextByLocaleWithDetector("Hello 世界", { detector: "regex" }); + + expect(chunks.map((chunk) => chunk.locale)).toEqual(["und-Latn", "und-Hani"]); + }); + + test("counts sections through detector entrypoint", async () => { + const result = await countSectionsWithDetector("Hello world", "all", { detector: "regex" }); + + expect(result.total).toBe(2); + }); +}); + describe("segmentTextByLocale", () => { test("splits Latin and Han scripts into separate locales", () => { const chunks = segmentTextByLocale("Hello 世界"); diff --git a/tsdown.config.ts b/tsdown.config.ts index d167169..bef95cc 100644 --- a/tsdown.config.ts +++ b/tsdown.config.ts @@ -2,7 +2,7 @@ import { defineConfig } from "tsdown"; export default defineConfig([ { - entry: { index: "src/index.ts" }, + entry: { index: "src/index.ts", detector: "src/detector/index.ts" }, format: ["esm"], dts: true, outDir: "dist/esm", @@ -14,7 +14,7 @@ export default defineConfig([ fixedExtension: true, }, { - entry: { index: "src/index.cjs.ts" }, + entry: { index: "src/index.cjs.ts", detector: "src/detector/index.cjs.ts" }, format: ["cjs"], dts: false, outDir: 
"dist/cjs",