From 8e3dbeba7d9803c9b7074ade003faa8f64f47b06 Mon Sep 17 00:00:00 2001 From: Quinn Slack Date: Sun, 26 May 2024 17:19:30 +0800 Subject: [PATCH 1/4] wip --- bin/sample-config.json | 3 + .../fixtures/workspace/.vscode/settings.json | 5 + client/web-playground/package.json | 1 + client/web-playground/tsconfig.json | 1 + client/web-playground/vite.config.ts | 20 +- pnpm-lock.yaml | 526 +++++++++++++++--- provider/docs/.gitignore | 2 + provider/docs/README.md | 75 +++ provider/docs/bin/create-archive.ts | 95 ++++ provider/docs/bin/create-index.ts | 60 ++ provider/docs/bin/search.ts | 70 +++ provider/docs/package.json | 40 ++ provider/docs/src/client/client.ts | 45 ++ provider/docs/src/client/search.ts | 76 +++ .../docs/src/corpus/archive/corpusArchive.ts | 33 ++ .../docs/src/corpus/archive/web/crawlQueue.ts | 45 ++ .../archive/web/webCorpusArchive.test.ts | 128 +++++ .../corpus/archive/web/webCorpusArchive.ts | 140 +++++ .../docs/src/corpus/cache/contentID.test.ts | 10 + provider/docs/src/corpus/cache/contentID.ts | 19 + provider/docs/src/corpus/doc/chunks.test.ts | 79 +++ provider/docs/src/corpus/doc/chunks.ts | 94 ++++ .../src/corpus/doc/contentExtractor.test.ts | 17 + .../docs/src/corpus/doc/contentExtractor.ts | 49 ++ provider/docs/src/corpus/doc/doc.ts | 14 + .../docs/src/corpus/index/corpusIndex.test.ts | 24 + provider/docs/src/corpus/index/corpusIndex.ts | 108 ++++ provider/docs/src/dom.ts | 23 + provider/docs/src/e2e.test.ts | 40 ++ provider/docs/src/env.ts | 3 + provider/docs/src/logger.ts | 1 + provider/docs/src/polyfill1.js | 6 + provider/docs/src/provider/multiplex.ts | 28 + provider/docs/src/provider/provider.ts | 146 +++++ provider/docs/src/search/embeddings.test.ts | 35 ++ provider/docs/src/search/embeddings.ts | 128 +++++ provider/docs/src/search/keyword.test.ts | 32 ++ provider/docs/src/search/keyword.ts | 25 + provider/docs/src/search/terms.test.ts | 13 + provider/docs/src/search/terms.ts | 181 ++++++ provider/docs/src/search/tfidf.test.ts | 55 ++ 
provider/docs/src/search/tfidf.ts | 141 +++++ provider/docs/src/search/types.ts | 26 + provider/docs/src/testdata/code/urlParsing.ts | 6 + .../docs/src/testdata/corpus/urlParsing.md | 15 + provider/docs/src/worker/api.ts | 11 + provider/docs/src/worker/webWorker.ts | 28 + provider/docs/src/worker/webWorkerClient.ts | 68 +++ provider/docs/tsconfig.json | 15 + provider/docs/vitest.config.ts | 3 + tsconfig.json | 1 + 51 files changed, 2742 insertions(+), 67 deletions(-) create mode 100644 provider/docs/.gitignore create mode 100644 provider/docs/README.md create mode 100644 provider/docs/bin/create-archive.ts create mode 100644 provider/docs/bin/create-index.ts create mode 100644 provider/docs/bin/search.ts create mode 100644 provider/docs/package.json create mode 100644 provider/docs/src/client/client.ts create mode 100644 provider/docs/src/client/search.ts create mode 100644 provider/docs/src/corpus/archive/corpusArchive.ts create mode 100644 provider/docs/src/corpus/archive/web/crawlQueue.ts create mode 100644 provider/docs/src/corpus/archive/web/webCorpusArchive.test.ts create mode 100644 provider/docs/src/corpus/archive/web/webCorpusArchive.ts create mode 100644 provider/docs/src/corpus/cache/contentID.test.ts create mode 100644 provider/docs/src/corpus/cache/contentID.ts create mode 100644 provider/docs/src/corpus/doc/chunks.test.ts create mode 100644 provider/docs/src/corpus/doc/chunks.ts create mode 100644 provider/docs/src/corpus/doc/contentExtractor.test.ts create mode 100644 provider/docs/src/corpus/doc/contentExtractor.ts create mode 100644 provider/docs/src/corpus/doc/doc.ts create mode 100644 provider/docs/src/corpus/index/corpusIndex.test.ts create mode 100644 provider/docs/src/corpus/index/corpusIndex.ts create mode 100644 provider/docs/src/dom.ts create mode 100644 provider/docs/src/e2e.test.ts create mode 100644 provider/docs/src/env.ts create mode 100644 provider/docs/src/logger.ts create mode 100644 provider/docs/src/polyfill1.js create mode 100644 
provider/docs/src/provider/multiplex.ts create mode 100644 provider/docs/src/provider/provider.ts create mode 100644 provider/docs/src/search/embeddings.test.ts create mode 100644 provider/docs/src/search/embeddings.ts create mode 100644 provider/docs/src/search/keyword.test.ts create mode 100644 provider/docs/src/search/keyword.ts create mode 100644 provider/docs/src/search/terms.test.ts create mode 100644 provider/docs/src/search/terms.ts create mode 100644 provider/docs/src/search/tfidf.test.ts create mode 100644 provider/docs/src/search/tfidf.ts create mode 100644 provider/docs/src/search/types.ts create mode 100644 provider/docs/src/testdata/code/urlParsing.ts create mode 100644 provider/docs/src/testdata/corpus/urlParsing.md create mode 100644 provider/docs/src/worker/api.ts create mode 100644 provider/docs/src/worker/webWorker.ts create mode 100644 provider/docs/src/worker/webWorkerClient.ts create mode 100644 provider/docs/tsconfig.json create mode 100644 provider/docs/vitest.config.ts diff --git a/bin/sample-config.json b/bin/sample-config.json index 8f4b3514..14c12294 100644 --- a/bin/sample-config.json +++ b/bin/sample-config.json @@ -13,6 +13,9 @@ "path": "**" } ] + }, + "file:///Users/sqs/src/github.com/sourcegraph/openctx/provider/docs/dist/provider.mjs": { + "index": "file:///Users/sqs/tmp/octx-provider-docs/anthropic-docs-web-index.json" } } } diff --git a/client/vscode/test/fixtures/workspace/.vscode/settings.json b/client/vscode/test/fixtures/workspace/.vscode/settings.json index 501bb015..b9c83bf0 100644 --- a/client/vscode/test/fixtures/workspace/.vscode/settings.json +++ b/client/vscode/test/fixtures/workspace/.vscode/settings.json @@ -2,6 +2,11 @@ "openctx.enable": true, "openctx.debug": true, "openctx.providers": { + //"../../../../../../provider/docs/dist/provider.cjs": { + // // "index": "http://localhost:5900/@fs/home/sqs/tmp/openctx-provider-docs/vite-docs-web.index.json", + // // "index": 
"file:///home/sqs/tmp/openctx-provider-docs/vite-docs-web.index.json", + // "index": "file:///Users/sqs/tmp/octx-provider-docs/sourcegraph-handbook-web-index.json", + //}, // "https://sourcegraph.test:3443/.api/openctx": true, // "https://openctx.org/npm/@openctx/provider-hello-world": true, "../../../../../../provider/hello-world/dist/index.js": true, diff --git a/client/web-playground/package.json b/client/web-playground/package.json index fc4fc94d..4d3b6ed7 100644 --- a/client/web-playground/package.json +++ b/client/web-playground/package.json @@ -24,6 +24,7 @@ "@codemirror/lint": "^6.4.2", "@openctx/client": "workspace:*", "@openctx/codemirror-extension": "workspace:*", + "@openctx/provider-docs": "workspace:*", "@openctx/provider-links": "workspace:*", "@openctx/provider-storybook": "workspace:*", "@openctx/ui-react": "workspace:*", diff --git a/client/web-playground/tsconfig.json b/client/web-playground/tsconfig.json index f1b6e459..807dd440 100644 --- a/client/web-playground/tsconfig.json +++ b/client/web-playground/tsconfig.json @@ -18,5 +18,6 @@ { "path": "../../lib/client" }, { "path": "../../lib/ui-react" }, { "path": "../../provider/links" }, + { "path": "../../provider/docs" }, ], } diff --git a/client/web-playground/vite.config.ts b/client/web-playground/vite.config.ts index 29d29ece..5b769f18 100644 --- a/client/web-playground/vite.config.ts +++ b/client/web-playground/vite.config.ts @@ -1,12 +1,15 @@ import { resolve } from 'path' import react from '@vitejs/plugin-react' -import { defineConfig } from 'vite' +import { defineConfig, searchForWorkspaceRoot } from 'vite' + +// TODO(sqs): un-hardcode +const docsProviderDataDir = resolve('/Users/sqs/tmp/octx-provider-docs') export default defineConfig(({ mode }) => ({ plugins: [react()], resolve: { - alias: - mode === 'development' + alias: [ + ...(mode === 'development' ? [ // In dev mode, build from TypeScript sources so we don't need to run `tsc -b` // in the background. 
@@ -17,8 +20,14 @@ export default defineConfig(({ mode }) => ({ replacement: '$1/src/index', }, ] - : [], + : []), + { + find: 'tmp-octx-provider-docs', + replacement: docsProviderDataDir, + }, + ], }, + define: {}, css: { devSourcemap: true, modules: { @@ -27,6 +36,9 @@ export default defineConfig(({ mode }) => ({ }, server: { port: 5900, + fs: { + allow: [searchForWorkspaceRoot(process.cwd()), docsProviderDataDir], + }, }, build: { emptyOutDir: false, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 61806055..82527c8c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -72,7 +72,7 @@ importers: version: 5.2.11(@types/node@20.10.0) vitest: specifier: ^1.6.0 - version: 1.6.0(@types/node@20.10.0) + version: 1.6.0(@types/node@20.10.0)(jsdom@23.2.0) bin: dependencies: @@ -341,6 +341,9 @@ importers: '@openctx/codemirror-extension': specifier: workspace:* version: link:../codemirror + '@openctx/provider-docs': + specifier: workspace:* + version: link:../../provider/docs '@openctx/provider-links': specifier: workspace:* version: link:../../provider/links @@ -492,6 +495,40 @@ importers: specifier: ^2.1.0 version: 2.1.0 + provider/docs: + dependencies: + '@mozilla/readability': + specifier: ^0.5.0 + version: 0.5.0 + '@openctx/provider': + specifier: workspace:* + version: link:../../lib/provider + '@xenova/transformers': + specifier: ^2.13.4 + version: 2.17.1 + buffer: + specifier: ^6.0.3 + version: 6.0.3 + jsdom: + specifier: ^23.2.0 + version: 23.2.0 + lru-cache: + specifier: ^10.1.0 + version: 10.1.0 + onnxruntime-web: + specifier: '1.16' + version: 1.16.3 + devDependencies: + '@types/jsdom': + specifier: ^21.1.6 + version: 21.1.6 + esbuild: + specifier: ^0.19.11 + version: 0.19.12 + vitest-fetch-mock: + specifier: ^0.2.2 + version: 0.2.2(vitest@1.6.0) + provider/google-docs: dependencies: '@googleapis/docs': @@ -719,6 +756,13 @@ packages: '@jridgewell/trace-mapping': 0.3.18 dev: true + /@asamuzakjp/dom-selector@2.0.2: + resolution: {integrity: 
sha512-x1KXOatwofR6ZAYzXRBL5wrdV0vwNxlTCK9NCuLqAzQYARqGcvFwiJA6A1ERuh+dgeA4Dxm3JBYictIes+SqUQ==} + dependencies: + bidi-js: 1.0.3 + css-tree: 2.3.1 + is-potential-custom-element-name: 1.0.1 + /@aw-web-design/x-default-browser@1.4.126: resolution: {integrity: sha512-Xk1sIhyNC/esHGGVjL/niHLowM0csl/kFO5uawBy4IrWwy0o1G8LGt3jP6nmWGz+USxeeqbihAmp/oVZju6wug==} hasBin: true @@ -3457,6 +3501,11 @@ packages: - supports-color dev: false + /@huggingface/jinja@0.2.2: + resolution: {integrity: sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA==} + engines: {node: '>=18'} + dev: false + /@isaacs/cliui@8.0.2: resolution: {integrity: sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==} engines: {node: '>=12'} @@ -3735,6 +3784,11 @@ packages: - supports-color dev: false + /@mozilla/readability@0.5.0: + resolution: {integrity: sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==} + engines: {node: '>=14.0.0'} + dev: false + /@ndelangen/get-tarball@3.0.9: resolution: {integrity: sha512-9JKTEik4vq+yGosHYhZ1tiH/3WpUS0Nh0kej4Agndhox8pAdWhEx5knFVRcb/ya9knCRCs1rPxNrSXTDdfVqpA==} dependencies: @@ -3795,6 +3849,49 @@ packages: resolution: {integrity: sha512-2LuNTFBIO0m7kKIQvvPHN6UE63VjpmL9rnEEaOOaiSPbZK+zUOYIzBAWcED+3XYzhYsd/0mD57VdxAEqqV52CQ==} dev: false + /@protobufjs/aspromise@1.1.2: + resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==} + dev: false + + /@protobufjs/base64@1.1.2: + resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==} + dev: false + + /@protobufjs/codegen@2.0.4: + resolution: {integrity: sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==} + dev: false + + /@protobufjs/eventemitter@1.1.0: + resolution: {integrity: 
sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==} + dev: false + + /@protobufjs/fetch@1.1.0: + resolution: {integrity: sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==} + dependencies: + '@protobufjs/aspromise': 1.1.2 + '@protobufjs/inquire': 1.1.0 + dev: false + + /@protobufjs/float@1.0.2: + resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==} + dev: false + + /@protobufjs/inquire@1.1.0: + resolution: {integrity: sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==} + dev: false + + /@protobufjs/path@1.1.2: + resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==} + dev: false + + /@protobufjs/pool@1.1.0: + resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==} + dev: false + + /@protobufjs/utf8@1.1.0: + resolution: {integrity: sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==} + dev: false + /@radix-ui/number@1.0.1: resolution: {integrity: sha512-T5gIdVO2mmPW3NNhjNgEP3cqMXjXL9UbO0BzWcXfvdBs+BohbQxvd/K5hSVKmn9/lbTdsQVKbUcP5WLCwvUbBg==} dependencies: @@ -6136,6 +6233,14 @@ packages: resolution: {integrity: sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg==} dev: true + /@types/jsdom@21.1.6: + resolution: {integrity: sha512-/7kkMsC+/kMs7gAYmmBR9P0vGTnOoLhQhyhQJSlXGI5bzTHp6xdo0TtKWQAsz6pmSAeVqKSbqeyP6hytqr9FDw==} + dependencies: + '@types/node': 20.11.20 + '@types/tough-cookie': 4.0.5 + parse5: 7.1.2 + dev: true + /@types/json-schema@7.0.14: resolution: {integrity: sha512-U3PUjAudAdJBeC2pgN8uTIKgxrb4nlDF3SF0++EldXQvQBGkpFZMSnwQiIoDU77tv45VgNkl/L4ouD+rEomujw==} dev: true @@ -6144,6 +6249,10 @@ packages: resolution: {integrity: 
sha512-Hwx9EUgdwf2GLarOjQp5ZH8ZmblzcbTBC2wtQWNKARBSxM9ezRIAUpeDTgoQRAFB0+8CNWXVA9+MaSOzOF3nPg==} dev: true + /@types/long@4.0.2: + resolution: {integrity: sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA==} + dev: false + /@types/mdast@4.0.3: resolution: {integrity: sha512-LsjtqsyF+d2/yFOYaN22dHZI1Cpwkrj+g06G8+qtUKlhovPW89YhqSnfKtMbkgmEtYpH2gydRNULd6y8mciAFg==} dependencies: @@ -6274,6 +6383,10 @@ packages: '@types/node': 20.11.20 dev: true + /@types/tough-cookie@4.0.5: + resolution: {integrity: sha512-/Ad8+nIOV7Rl++6f1BdKxFSMgmoqEoYbHRpPcx3JEfv8VRsQe9Z4mCXeJBzxs7mbHY/XOZZuXlRNfhpVPbs6ZA==} + dev: true + /@types/unist@2.0.3: resolution: {integrity: sha512-FvUupuM3rlRsRtCN+fDudtmytGO6iHJuuRKS1Ss0pG5z8oX0diNEw94UEL7hgDbpN94rgaK5R7sWm6RrSkZuAQ==} @@ -6403,7 +6516,7 @@ packages: std-env: 3.7.0 strip-literal: 2.0.0 test-exclude: 6.0.0 - vitest: 1.6.0(@types/node@20.10.0) + vitest: 1.6.0(@types/node@20.10.0)(jsdom@23.2.0) transitivePeerDependencies: - supports-color dev: true @@ -6515,6 +6628,16 @@ packages: resolution: {integrity: sha512-CqTpxOlUCPWRNUPZDxT5v2NnHXA4oox612iUGnmTUGQFhZ1Gkj8kirtl/2wcF6MqX7+PqqicZzOCBKKfIn0dww==} dev: true + /@xenova/transformers@2.17.1: + resolution: {integrity: sha512-zo702tQAFZXhzeD2GCYUNUqeqkoueOdiSbQWa4s0q7ZE4z8WBIwIsMMPGobpgdqjQ2u0Qulo08wuqVEUrBXjkQ==} + dependencies: + '@huggingface/jinja': 0.2.2 + onnxruntime-web: 1.14.0 + sharp: 0.32.6 + optionalDependencies: + onnxruntime-node: 1.14.0 + dev: false + /@yarnpkg/esbuild-plugin-pnp@3.0.0-rc.15(esbuild@0.18.20): resolution: {integrity: sha512-kYzDJO5CA9sy+on/s2aIW0411AklfCi8Ck/4QDivOqsMKpStZA2SsR+X27VTggGwpStWaLrjJcDcdDMowtG8MA==} engines: {node: '>=14.15.0'} @@ -6806,7 +6929,6 @@ packages: /b4a@1.6.4: resolution: {integrity: sha512-fpWrvyVHEKyeEvbKZTVOeZF3VSKKWtJxFIxX/jaVPf+cLbGUSitjb49pHLqPV2BUNNZ0LcoeEGfE/YCpyDYHIw==} - dev: true /babel-core@7.0.0-bridge.0(@babel/core@7.23.9): resolution: {integrity: 
sha512-poPX9mZH/5CSanm50Q+1toVci6pv5KSRv/5TWCwtzQS5XEwn40BcCrgIeMFWP9CKKIniKXNxoIOnOq4VVlGXhg==} @@ -6894,6 +7016,11 @@ packages: open: 8.4.2 dev: true + /bidi-js@1.0.3: + resolution: {integrity: sha512-RKshQI1R3YQ+n9YJz2QQ147P66ELpa1FQEg20Dk8oW9t2KgLbpDLLp9aGZ7y8WHSshDknG0bknqGw5/tyCs5tw==} + dependencies: + require-from-string: 2.0.2 + /big-integer@1.6.51: resolution: {integrity: sha512-GPEid2Y9QU1Exl1rpO9B2IPJGHPSupF5GnVIP0blYvNOMer2bTvSWs1jGOUg04hTmu67nmLsQ9TBo1puaotBHg==} engines: {node: '>=0.6'} @@ -6914,7 +7041,6 @@ packages: buffer: 5.7.1 inherits: 2.0.4 readable-stream: 3.6.2 - dev: true /body-parser@1.20.1: resolution: {integrity: sha512-jWi7abTbYwajOytWCQc37VulmWiRae5RyTpaCyDcS5/lMdtwSz5lOpDE67srw/HYe35f1z3fDQw+3txg7gNtWw==} @@ -7022,7 +7148,13 @@ packages: dependencies: base64-js: 1.5.1 ieee754: 1.2.1 - dev: true + + /buffer@6.0.3: + resolution: {integrity: sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==} + dependencies: + base64-js: 1.5.1 + ieee754: 1.2.1 + dev: false /bundle-name@4.1.0: resolution: {integrity: sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==} @@ -7208,7 +7340,6 @@ packages: /chownr@1.1.4: resolution: {integrity: sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==} requiresBuild: true - dev: true /chownr@2.0.0: resolution: {integrity: sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==} @@ -7342,7 +7473,6 @@ packages: engines: {node: '>=7.0.0'} dependencies: color-name: 1.1.4 - dev: true /color-name@1.1.3: resolution: {integrity: sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw==} @@ -7350,7 +7480,21 @@ packages: /color-name@1.1.4: resolution: {integrity: sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==} - dev: true + + /color-string@1.9.1: + resolution: {integrity: 
sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==} + dependencies: + color-name: 1.1.4 + simple-swizzle: 0.2.2 + dev: false + + /color@4.2.3: + resolution: {integrity: sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A==} + engines: {node: '>=12.5.0'} + dependencies: + color-convert: 2.0.1 + color-string: 1.9.1 + dev: false /colord@2.9.3: resolution: {integrity: sha512-jeC1axXpnb0/2nn/Y1LPuLdgXBLH7aDcHu4KEKfqw3CUhX7ZpfBSlPKyqXE6btIgEzfWtrX3/tyBCaCvXvMkOw==} @@ -7564,7 +7708,6 @@ packages: dependencies: mdn-data: 2.0.30 source-map-js: 1.0.2 - dev: true /css-what@6.1.0: resolution: {integrity: sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==} @@ -7580,6 +7723,12 @@ packages: resolution: {integrity: sha512-FAaLDaplstoRsDR8XGYH51znUN0UY7nMc6Z9/fvE8EXGwvJE9hu7W2vHwx1+bd6gCYnln9nLbzxFTrcO9YQDZw==} dev: false + /cssstyle@4.0.1: + resolution: {integrity: sha512-8ZYiJ3A/3OkDd093CBT/0UKDWry7ak4BdPTFP2+QEP7cmhouyq/Up709ASSj2cK02BbZiMgk7kYjZNS4QP5qrQ==} + engines: {node: '>=18'} + dependencies: + rrweb-cssom: 0.6.0 + /csstype@3.1.0: resolution: {integrity: sha512-uX1KG+x9h5hIJsaKR9xHUeUraxf8IODOwq9JLNPq6BwB04a/xgpq3rcx47l5BZu5zBPlgD342tdke3Hom/nJRA==} @@ -7590,6 +7739,13 @@ packages: type: 1.2.0 dev: true + /data-urls@5.0.0: + resolution: {integrity: sha512-ZYP5VBHshaDAiVZxjbRVcFJpc+4xGgT0bK3vzy1HLN8jTO975HEbuYzZJcHoQEY5K1a0z8YayJkyVETa08eNTg==} + engines: {node: '>=18'} + dependencies: + whatwg-mimetype: 4.0.0 + whatwg-url: 14.0.0 + /date-fns@2.30.0: resolution: {integrity: sha512-fnULvOpxnC5/Vg3NCiWelDsLiUc9bRwAPs/+LfTLNvetFCtCTN+yQz15C/fs4AwX1R9K5GLtLfn8QW+dWisaAw==} engines: {node: '>=0.11'} @@ -7653,6 +7809,9 @@ packages: engines: {node: '>=10'} dev: true + /decimal.js@10.4.3: + resolution: {integrity: sha512-VBBaLc1MgL5XpzgIP7ny5Z6Nx3UrRkIViUkPUdtl9aya5amy3De1gsUUSB1g3+3sExYNjCAsAznmukyxCb1GRA==} + 
/decode-named-character-reference@1.0.2: resolution: {integrity: sha512-O8x12RzrUF8xyVcY0KJowWsmaJxQbmy0/EtnNtHRpsOcT7dFk5W598coHqBVpmWo1oQQfsCqfCmkZN5DJrZVdg==} dependencies: @@ -7665,8 +7824,6 @@ packages: requiresBuild: true dependencies: mimic-response: 3.1.0 - dev: true - optional: true /deep-eql@4.1.3: resolution: {integrity: sha512-WaEtAOpRA1MQ0eohqZjpGD8zdI0Ovsm8mmFhaDN8dvDZzyoUMcYDnf5Y6iu7HTXxf8JDS23qWa4a+hKCDyOPzw==} @@ -7707,8 +7864,6 @@ packages: resolution: {integrity: sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==} engines: {node: '>=4.0.0'} requiresBuild: true - dev: true - optional: true /deepmerge@4.3.1: resolution: {integrity: sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==} @@ -7819,8 +7974,11 @@ packages: resolution: {integrity: sha512-463v3ZeIrcWtdgIg6vI6XUncguvr2TnGl4SzDXinkt9mSLpBJKXT3mW6xT3VQdDN11+WVs29pgvivTc4Lp8v+w==} engines: {node: '>=8'} requiresBuild: true - dev: true - optional: true + + /detect-libc@2.0.3: + resolution: {integrity: sha512-bwy0MGW55bG41VqxxypOsdSdGqLwXPI/focwgTYCFMbdUiBAxLg9CFzG08sz2aqzknwiX7Hkl0bQENjg8iLByw==} + engines: {node: '>=8'} + dev: false /detect-node-es@1.1.0: resolution: {integrity: sha512-ypdmJU/TbBby2Dxibuv7ZLW3Bs1QEmM7nHjEANfohJLvE0XVujisn1qPJcZxg+qDucsr+bP6fLD1rPS3AhJ7EQ==} @@ -7970,7 +8128,6 @@ packages: resolution: {integrity: sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==} dependencies: once: 1.4.0 - dev: true /entities@2.1.0: resolution: {integrity: sha512-hCx1oky9PFrJ611mf0ifBLBRW8lUUVRlFolb5gWRfIELabBlbp9xZvrqZLZAs+NxFnbfQoeGd8wDkygjg7U85w==} @@ -8335,8 +8492,6 @@ packages: resolution: {integrity: sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==} engines: {node: '>=6'} requiresBuild: true - dev: true - optional: true /express@4.18.2: resolution: {integrity: 
sha512-5/PsL6iGPdfQ/lKM1UuielYgv3BUoJfz1aUwU9vHZ+J7gyvwdQXFEBIEIaxeGf0GIcreATNyBExtalisDbuMqQ==} @@ -8417,7 +8572,6 @@ packages: /fast-fifo@1.3.2: resolution: {integrity: sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==} - dev: true /fast-glob@3.3.1: resolution: {integrity: sha512-kNFPyjhh5cKjrUltxs+wFx+ZkbRaxxmZ+X0ZU31SOsxCEtP9VPgtq2teZw1DebupL5GmDaNQ6yKMMVcM41iqDg==} @@ -8554,6 +8708,10 @@ packages: hasBin: true dev: true + /flatbuffers@1.12.0: + resolution: {integrity: sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ==} + dev: false + /flatted@3.3.1: resolution: {integrity: sha512-X8cqMLLie7KsNUDSdzeN8FYK9rEt4Dt67OsG/DNGnYTSDBG4uFAJFBnUeiV+zCVAvwFy56IjM9sH51jVaEhNxw==} dev: true @@ -8594,6 +8752,14 @@ packages: combined-stream: 1.0.8 mime-types: 2.1.35 + /form-data@4.0.0: + resolution: {integrity: sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==} + engines: {node: '>= 6'} + dependencies: + asynckit: 0.4.0 + combined-stream: 1.0.8 + mime-types: 2.1.35 + /forwarded@0.2.0: resolution: {integrity: sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==} engines: {node: '>= 0.6'} @@ -8609,7 +8775,6 @@ packages: /fs-constants@1.0.0: resolution: {integrity: sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==} requiresBuild: true - dev: true /fs-extra@10.1.0: resolution: {integrity: sha512-oRXApq54ETRj4eMiFzGnHWGy+zo5raudjuxN0b8H7s/RU2oW0Wvsx9O0ACRN/kRq9E8Vu/ReskGB5o3ji+FzHQ==} @@ -8762,8 +8927,6 @@ packages: /github-from-package@0.0.0: resolution: {integrity: sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==} requiresBuild: true - dev: true - optional: true /github-slugger@1.5.0: resolution: {integrity: sha512-wIh+gKBI9Nshz2o46B0B3f5k/W+WI9ZAv6y5Dn5WJ5SK1t0TnDimB4WE5rmTD05ZAIn8HALCZVmCsvj0w0v0lw==} @@ 
-8932,6 +9095,10 @@ packages: - supports-color dev: false + /guid-typescript@1.0.9: + resolution: {integrity: sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==} + dev: false + /gunzip-maybe@1.4.2: resolution: {integrity: sha512-4haO1M4mLO91PW57BMsDFf75UmwoRX0GkdD+Faw+Lr+r/OZrOCS0pIBwOL1xCKQqnQzbNFGgK2V2CpBUPeFNTw==} hasBin: true @@ -9106,6 +9273,12 @@ packages: lru-cache: 6.0.0 dev: true + /html-encoding-sniffer@4.0.0: + resolution: {integrity: sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==} + engines: {node: '>=18'} + dependencies: + whatwg-encoding: 3.1.1 + /html-escaper@2.0.0: resolution: {integrity: sha512-a4u9BeERWGu/S8JiWEAQcdrg9v4QArtP9keViQjGMdff20fBdd8waotXaNmODqBe6uZ3Nafi7K/ho4gCQHV3Ig==} dev: true @@ -9181,7 +9354,6 @@ packages: debug: 4.3.4(supports-color@8.1.1) transitivePeerDependencies: - supports-color - dev: true /https-proxy-agent@4.0.0: resolution: {integrity: sha512-zoDhWrkR3of1l9QAL8/scJZyLu8j/gBkcwcaQOZh7Gyh/+uJQzGVETdgT30akuwkpL8HTRfssqI3BZuV18teDg==} @@ -9228,10 +9400,15 @@ packages: dependencies: safer-buffer: 2.1.2 + /iconv-lite@0.6.3: + resolution: {integrity: sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==} + engines: {node: '>=0.10.0'} + dependencies: + safer-buffer: 2.1.2 + /ieee754@1.2.1: resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==} requiresBuild: true - dev: true /ignore@5.2.4: resolution: {integrity: sha512-MAb38BcSbH0eHNBxn7ql2NH/kX33OkB3lZ1BNdh7ENeRChHTYsTvWrMubiIAMNS2llXEEgZ1MUOBtXChP3kaFQ==} @@ -9286,7 +9463,6 @@ packages: /ini@1.3.8: resolution: {integrity: sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==} requiresBuild: true - dev: true /inline-style-parser@0.1.1: resolution: {integrity: 
sha512-7NXolsK4CAS5+xvdj5OMMbI962hU/wvwoxk+LWR9Ek9bVtyuuYScDN6eS0rUm6TxApFpw7CX1o4uJzcd4AyD3Q==} @@ -9364,6 +9540,10 @@ packages: resolution: {integrity: sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==} dev: true + /is-arrayish@0.3.2: + resolution: {integrity: sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==} + dev: false + /is-bigint@1.0.1: resolution: {integrity: sha512-J0ELF4yHFxHy0cmSxZuheDOz2luOdVvqjwmEcj8H/L1JHeuEDSDbeRP+Dk9kFVk5RTFzbucJ2Kb9F7ixY2QaCg==} dev: false @@ -9533,6 +9713,9 @@ packages: resolution: {integrity: sha512-VRSzKkbMm5jMDoKLbltAkFQ5Qr7VDiTFGXxYFXXowVj387GeGNOCsOH6Msy00SGZ3Fp84b1Naa1psqgcCIEP5Q==} engines: {node: '>=0.10.0'} + /is-potential-custom-element-name@1.0.1: + resolution: {integrity: sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==} + /is-promise@2.2.2: resolution: {integrity: sha512-+lP4/6lKUBfQjZ2pdxThZvLUAafmZb8OAxFb8XXtiQmS35INgr85hdOGoEs124ez1FCnZJt6jau/T+alh58QFQ==} dev: true @@ -9824,6 +10007,41 @@ packages: - supports-color dev: true + /jsdom@23.2.0: + resolution: {integrity: sha512-L88oL7D/8ufIES+Zjz7v0aes+oBMh2Xnh3ygWvL0OaICOomKEPKuPnIfBJekiXr+BHbbMjrWn/xqrDQuxFTeyA==} + engines: {node: '>=18'} + peerDependencies: + canvas: ^2.11.2 + peerDependenciesMeta: + canvas: + optional: true + dependencies: + '@asamuzakjp/dom-selector': 2.0.2 + cssstyle: 4.0.1 + data-urls: 5.0.0 + decimal.js: 10.4.3 + form-data: 4.0.0 + html-encoding-sniffer: 4.0.0 + http-proxy-agent: 7.0.0 + https-proxy-agent: 7.0.2 + is-potential-custom-element-name: 1.0.1 + parse5: 7.1.2 + rrweb-cssom: 0.6.0 + saxes: 6.0.0 + symbol-tree: 3.2.4 + tough-cookie: 4.1.4 + w3c-xmlserializer: 5.0.0 + webidl-conversions: 7.0.0 + whatwg-encoding: 3.1.1 + whatwg-mimetype: 4.0.0 + whatwg-url: 14.0.0 + ws: 8.17.0 + xml-name-validator: 5.0.0 + transitivePeerDependencies: + - bufferutil + - supports-color + - utf-8-validate + 
/jsesc@0.5.0: resolution: {integrity: sha512-uZz5UnB7u4T9LvwmFqXii7pZSouaRPorGs5who1Ip7VO0wxanFvBL7GkM6dTHlgX+jhBApRetaWpnDabOeTcnA==} hasBin: true @@ -10128,6 +10346,14 @@ packages: is-unicode-supported: 0.1.0 dev: true + /long@4.0.0: + resolution: {integrity: sha512-XsP+KhQif4bjX1kbuSiySJFNAehNxgLb6hPRGJ9QsUr8ajHkuXGdrHmFUTUUXhDwVX2R5bY4JNZEwbUiMhV+MA==} + dev: false + + /long@5.2.3: + resolution: {integrity: sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q==} + dev: false + /longest-streak@3.1.0: resolution: {integrity: sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==} dev: false @@ -10159,7 +10385,6 @@ packages: engines: {node: '>=10'} dependencies: yallist: 4.0.0 - dev: true /lru-queue@0.1.0: resolution: {integrity: sha512-BpdYkt9EvGl8OfWHDQPISVpcl5xZthb+XPsbELj5AQXxIC8IriDZIQYjBJPEm5rS420sjZ0TLEzRcq5KdBhYrQ==} @@ -10419,7 +10644,6 @@ packages: /mdn-data@2.0.30: resolution: {integrity: sha512-GaqWWShW4kv/G9IEucWScBx9G1/vsFZZJUO+tD26M8J8z3Kw5RDQjaoZe03YAClgeS/SWPOcb4nkFBTEi5DUEA==} - dev: true /mdurl@1.0.1: resolution: {integrity: sha512-/sKlQJCBYVY9Ers9hqzKou4H6V5UWc/M59TH2dvkt+84itfnq7uFOMLpOiOS4ujvHP4etln18fmIxA5R5fll0g==} @@ -10785,8 +11009,6 @@ packages: resolution: {integrity: sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==} engines: {node: '>=10'} requiresBuild: true - dev: true - optional: true /min-indent@1.0.1: resolution: {integrity: sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg==} @@ -10830,7 +11052,6 @@ packages: /minimist@1.2.8: resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==} - dev: true /minipass@3.3.5: resolution: {integrity: sha512-rQ/p+KfKBkeNwo04U15i+hOwoVBVmekmm/HcfTkTN2t9pbQKCMm4eN5gFeqgrrSp/kH/7BYYhTIHOxGqzbBPaA==} @@ -10861,7 +11082,6 @@ packages: /mkdirp-classic@0.5.3: resolution: 
{integrity: sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==} - dev: true /mkdirp@0.5.6: resolution: {integrity: sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==} @@ -10973,8 +11193,6 @@ packages: /napi-build-utils@1.0.2: resolution: {integrity: sha512-ONmRUqK7zj7DWX0D9ADe03wbwOBZxNAfF20PlGfCWQcD3+/MakShIHrMqx9YwPTfxDdF1zLeL+RGZiR9kGMLdg==} requiresBuild: true - dev: true - optional: true /negotiator@0.6.3: resolution: {integrity: sha512-+EUsqGPLsM+j/zdChZjsnX51g4XrHFOIXwfnCVPGlQk/k5giakcKsuxCObBRu6DSm9opw/O6slWbJdghQM4bBg==} @@ -10994,8 +11212,6 @@ packages: requiresBuild: true dependencies: semver: 7.5.4 - dev: true - optional: true /node-addon-api@4.3.0: resolution: {integrity: sha512-73sE9+3UaLYYFmDsFZnqCInzPyh3MqIwZO9cw58yIqAZhONrrabrYyYe3TuIqtIiOuTXVhsGau8hcrhhwSsDIQ==} @@ -11003,6 +11219,10 @@ packages: dev: true optional: true + /node-addon-api@6.1.0: + resolution: {integrity: sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA==} + dev: false + /node-dir@0.1.17: resolution: {integrity: sha512-tmPX422rYgofd4epzrNoOXiE8XFZYOcCq1vD7MAXCDO+O+zndlA2ztdKKMa+EeuBG5tHETpr4ml4RGgpqDCCAg==} engines: {node: '>= 0.10.5'} @@ -11164,6 +11384,51 @@ packages: resolution: {integrity: sha512-Fvw+Jemq5fjjyWz6CpKx6w9s7xxqo3+JCyM0WXWeCSOboZ8ABkyvP8ID4CZuChA/wxSx+XSJmdOm8rGVyJ1hdQ==} dev: true + /onnx-proto@4.0.4: + resolution: {integrity: sha512-aldMOB3HRoo6q/phyB6QRQxSt895HNNw82BNyZ2CMh4bjeKv7g/c+VpAFtJuEMVfYLMbRx61hbuqnKceLeDcDA==} + dependencies: + protobufjs: 6.11.4 + dev: false + + /onnxruntime-common@1.14.0: + resolution: {integrity: sha512-3LJpegM2iMNRX2wUmtYfeX/ytfOzNwAWKSq1HbRrKc9+uqG/FsEA0bbKZl1btQeZaXhC26l44NWpNUeXPII7Ew==} + dev: false + + /onnxruntime-common@1.16.3: + resolution: {integrity: sha512-ZZfFzEqBf6YIGwB9PtBLESHI53jMXA+/hn+ACVUbEfPuK2xI5vMGpLPn+idpwCmHsKJNRzRwqV12K+6TQj6tug==} + dev: false + + /onnxruntime-node@1.14.0: + 
resolution: {integrity: sha512-5ba7TWomIV/9b6NH/1x/8QEeowsb+jBEvFzU6z0T4mNsFwdPqXeFUM7uxC6QeSRkEbWu3qEB0VMjrvzN/0S9+w==} + os: [win32, darwin, linux] + requiresBuild: true + dependencies: + onnxruntime-common: 1.14.0 + dev: false + optional: true + + /onnxruntime-web@1.14.0: + resolution: {integrity: sha512-Kcqf43UMfW8mCydVGcX9OMXI2VN17c0p6XvR7IPSZzBf/6lteBzXHvcEVWDPmCKuGombl997HgLqj91F11DzXw==} + dependencies: + flatbuffers: 1.12.0 + guid-typescript: 1.0.9 + long: 4.0.0 + onnx-proto: 4.0.4 + onnxruntime-common: 1.14.0 + platform: 1.3.6 + dev: false + + /onnxruntime-web@1.16.3: + resolution: {integrity: sha512-8O1xCG/RcNQNYYWvdiQJSNpncVg78OVOFeV6MYs/jx++/b12oje8gYUzKqz9wR/sXiX/8TCvdyHgEjj5gQGKUg==} + dependencies: + flatbuffers: 1.12.0 + guid-typescript: 1.0.9 + long: 5.2.3 + onnxruntime-common: 1.16.3 + platform: 1.3.6 + protobufjs: 7.3.0 + dev: false + /open@10.1.0: resolution: {integrity: sha512-mnkeQ1qP5Ue2wd+aivTD3NHd/lZ96Lu0jgf0pwktLPtx6cTZiH7tyeGRRHs0zX0rbrahXPnXlUnbeXyaBBuIaw==} engines: {node: '>=18'} @@ -11338,7 +11603,6 @@ packages: resolution: {integrity: sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==} dependencies: entities: 4.5.0 - dev: true /parseurl@1.3.3: resolution: {integrity: sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==} @@ -11487,6 +11751,10 @@ packages: pathe: 1.1.2 dev: true + /platform@1.3.6: + resolution: {integrity: sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==} + dev: false + /playwright-core@1.39.0: resolution: {integrity: sha512-+k4pdZgs1qiM+OUkSjx96YiKsXsmb59evFoqv8SKO067qBA+Z2s/dCzJij/ZhdQcs2zlTAgRKfeiiLm8PQ2qvw==} engines: {node: '>=16'} @@ -11635,8 +11903,6 @@ packages: simple-get: 4.0.1 tar-fs: 2.1.1 tunnel-agent: 0.6.0 - dev: true - optional: true /prettier@2.8.1: resolution: {integrity: sha512-lqGoSJBQNJidqCHE80vqZJHWHRFoNYsSpP9AjFhlhi9ODCJA541svILes/+/1GM3VaL/abZi7cpFzOpdR9UPKg==} 
@@ -11709,6 +11975,45 @@ packages: resolution: {integrity: sha512-9t5qARVofg2xQqKtytzt+lZ4d1Qvj8t5B8fEwXK6qOfgRLgH/b13QlgEyDh033NOS31nXeFbYv7CLUDG1CeifQ==} dev: false + /protobufjs@6.11.4: + resolution: {integrity: sha512-5kQWPaJHi1WoCpjTGszzQ32PG2F4+wRY6BmAT4Vfw56Q2FZ4YZzK20xUYQH4YkfehY1e6QSICrJquM6xXZNcrw==} + hasBin: true + requiresBuild: true + dependencies: + '@protobufjs/aspromise': 1.1.2 + '@protobufjs/base64': 1.1.2 + '@protobufjs/codegen': 2.0.4 + '@protobufjs/eventemitter': 1.1.0 + '@protobufjs/fetch': 1.1.0 + '@protobufjs/float': 1.0.2 + '@protobufjs/inquire': 1.1.0 + '@protobufjs/path': 1.1.2 + '@protobufjs/pool': 1.1.0 + '@protobufjs/utf8': 1.1.0 + '@types/long': 4.0.2 + '@types/node': 20.11.20 + long: 4.0.0 + dev: false + + /protobufjs@7.3.0: + resolution: {integrity: sha512-YWD03n3shzV9ImZRX3ccbjqLxj7NokGN0V/ESiBV5xWqrommYHYiihuIyavq03pWSGqlyvYUFmfoMKd+1rPA/g==} + engines: {node: '>=12.0.0'} + requiresBuild: true + dependencies: + '@protobufjs/aspromise': 1.1.2 + '@protobufjs/base64': 1.1.2 + '@protobufjs/codegen': 2.0.4 + '@protobufjs/eventemitter': 1.1.0 + '@protobufjs/fetch': 1.1.0 + '@protobufjs/float': 1.0.2 + '@protobufjs/inquire': 1.1.0 + '@protobufjs/path': 1.1.2 + '@protobufjs/pool': 1.1.0 + '@protobufjs/utf8': 1.1.0 + '@types/node': 20.11.20 + long: 5.2.3 + dev: false + /proxy-addr@2.0.7: resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==} engines: {node: '>= 0.10'} @@ -11720,6 +12025,9 @@ packages: resolution: {integrity: sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==} dev: true + /psl@1.9.0: + resolution: {integrity: sha512-E/ZsdU4HLs/68gYzgGTkMicWTLPdAftJLfJFlLUAAKZGkStNU72sZjT66SnMDVOfOWY/YAoiD7Jxa9iHvngcag==} + /pump@2.0.1: resolution: {integrity: sha512-ruPMNRkN3MHP1cWJc9OWr+T/xDP0jhXYCLfJcBuX54hhfIBnaQmAUMfDcG4DM5UMWByBbJY69QSphm3jtDKIkA==} dependencies: @@ -11732,7 +12040,6 @@ packages: dependencies: end-of-stream: 
1.4.4 once: 1.4.0 - dev: true /pumpify@1.5.1: resolution: {integrity: sha512-oClZI37HvuUJJxSKKrC17bZ9Cu0ZYhEAGPsPUy9KlMUmv9dKX2o77RUmq7f3XjIxbwyGwYzbzQ1L2Ks8sIradQ==} @@ -11745,7 +12052,10 @@ packages: /punycode@2.3.0: resolution: {integrity: sha512-rRV+zQD8tVFys26lAGR9WUuS4iUAngJScM+ZRSKtvl5tKeZ2t5bvdNFdNHBW9FWR4guGHlgmsZ1G7BSm2wTbuA==} engines: {node: '>=6'} - dev: true + + /punycode@2.3.1: + resolution: {integrity: sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==} + engines: {node: '>=6'} /puppeteer-core@2.1.1: resolution: {integrity: sha512-n13AWriBMPYxnpbb6bnaY5YoY6rGj8vPLrz6CZF3o0qJNEwlcfJVxBzYZ0NJsQ21UbdJoijPCDrM++SUVEz7+w==} @@ -11806,9 +12116,11 @@ packages: dependencies: side-channel: 1.0.4 + /querystringify@2.2.0: + resolution: {integrity: sha512-FIqgj2EUvTa7R50u0rGsyTftzjYmv/a3hO345bZNrqabNqjtgiDMgmo4mkUjd+nzU5oF3dClKqFIPUKybUyqoQ==} + /queue-tick@1.0.1: resolution: {integrity: sha512-kJt5qhMxoszgU/62PLP1CJytzd2NKetjSRnyuj31fDd3Rlcz3fzlFdFLD1SItunPwyqEOkca6GbV612BWfaBag==} - dev: true /quick-lru@5.1.1: resolution: {integrity: sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==} @@ -11847,8 +12159,6 @@ packages: ini: 1.3.8 minimist: 1.2.8 strip-json-comments: 2.0.1 - dev: true - optional: true /react-colorful@5.6.1(react-dom@18.2.0)(react@18.2.0): resolution: {integrity: sha512-1exovf0uGTGyq5mXQT0zgQ80uvj2PCwvF8zY1RN9/vbJVSjSo3fsB/4L3ObbF7u70NduSiK4xu4Y6q1MHoUGEw==} @@ -12078,7 +12388,6 @@ packages: inherits: 2.0.4 string_decoder: 1.3.0 util-deprecate: 1.0.2 - dev: true /readdirp@3.6.0: resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==} @@ -12260,7 +12569,9 @@ packages: /require-from-string@2.0.2: resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==} engines: {node: '>=0.10.0'} - dev: true + + /requires-port@1.0.0: + resolution: 
{integrity: sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ==} /resolve-from@4.0.0: resolution: {integrity: sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==} @@ -12382,6 +12693,9 @@ packages: fsevents: 2.3.3 dev: false + /rrweb-cssom@0.6.0: + resolution: {integrity: sha512-APM0Gt1KoXBz0iIkkdB/kfvGOwC4UuJFeG/c+yV7wSc7q96cG/kJ0HiYCnzivD9SB53cLV1MlHFNfOuPaadYSw==} + /run-applescript@7.0.0: resolution: {integrity: sha512-9by4Ij99JUr/MCFBUkDKLWK3G9HVXmabKz9U5MlIAIuvuzkiOicRYs8XJLxX+xahD+mLiiCYDqF9dKAgtzKP1A==} engines: {node: '>=18'} @@ -12419,6 +12733,12 @@ packages: resolution: {integrity: sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw==} dev: true + /saxes@6.0.0: + resolution: {integrity: sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==} + engines: {node: '>=v12.22.7'} + dependencies: + xmlchars: 2.2.0 + /scheduler@0.23.0: resolution: {integrity: sha512-CtuThmgHNg7zIZWAXi3AsyIzA3n4xx7aNyjwC2VJldO2LMVDhFK+63xGqq6CsJH4rTAt6/M+N4GhZiDYPx9eUw==} dependencies: @@ -12445,7 +12765,6 @@ packages: hasBin: true dependencies: lru-cache: 6.0.0 - dev: true /send@0.18.0: resolution: {integrity: sha512-qqWzuOjSFOuqPjFe4NOsMLafToQQwBSOEpS+FwEt3A2V3vKubTquT3vmLTQpFgMXp8AlFWFuP1qKaJZOtPpVXg==} @@ -12524,6 +12843,21 @@ packages: kind-of: 6.0.3 dev: true + /sharp@0.32.6: + resolution: {integrity: sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==} + engines: {node: '>=14.15.0'} + requiresBuild: true + dependencies: + color: 4.2.3 + detect-libc: 2.0.3 + node-addon-api: 6.1.0 + prebuild-install: 7.1.1 + semver: 7.5.4 + simple-get: 4.0.1 + tar-fs: 3.0.4 + tunnel-agent: 0.6.0 + dev: false + /shebang-command@2.0.0: resolution: {integrity: sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==} engines: {node: '>=8'} @@ 
-12568,8 +12902,6 @@ packages: /simple-concat@1.0.1: resolution: {integrity: sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==} requiresBuild: true - dev: true - optional: true /simple-get@4.0.1: resolution: {integrity: sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==} @@ -12578,8 +12910,12 @@ packages: decompress-response: 6.0.0 once: 1.4.0 simple-concat: 1.0.1 - dev: true - optional: true + + /simple-swizzle@0.2.2: + resolution: {integrity: sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==} + dependencies: + is-arrayish: 0.3.2 + dev: false /sirv@2.0.4: resolution: {integrity: sha512-94Bdh3cC2PKrbgSOUqTiGPWVZeSiXfKOVZNJniWoqrWrRkB1CJzBU3NEbiTsPcYy1lDsANA/THzS+9WBiy5nfQ==} @@ -12728,7 +13064,6 @@ packages: dependencies: fast-fifo: 1.3.2 queue-tick: 1.0.1 - dev: true /string-width@4.2.3: resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==} @@ -12759,7 +13094,6 @@ packages: requiresBuild: true dependencies: safe-buffer: 5.2.1 - dev: true /stringify-entities@4.0.3: resolution: {integrity: sha512-BP9nNHMhhfcMbiuQKCqMjhDP5yBCAxsPu4pHFFzJ6Alo9dZgY4VLDPutXqIjpRiMoKdp7Av85Gr73Q5uH9k7+g==} @@ -12808,8 +13142,6 @@ packages: resolution: {integrity: sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==} engines: {node: '>=0.10.0'} requiresBuild: true - dev: true - optional: true /strip-json-comments@3.1.1: resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==} @@ -12971,6 +13303,9 @@ packages: - utf-8-validate dev: true + /symbol-tree@3.2.4: + resolution: {integrity: sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==} + /synchronous-promise@2.0.17: resolution: {integrity: 
sha512-AsS729u2RHUfEra9xJrE39peJcc2stq2+poBXX8bcM08Y6g9j/i/PUzwNQqkaJde7Ntg1TO7bSREbR5sdosQ+g==} dev: true @@ -13039,7 +13374,6 @@ packages: mkdirp-classic: 0.5.3 pump: 3.0.0 tar-stream: 2.2.0 - dev: true /tar-fs@3.0.4: resolution: {integrity: sha512-5AFQU8b9qLfZCX9zp2duONhPmZv0hGYiBPJsyUdqMjzq/mqVpy/rEUSeHk1+YitmxugaptgBh5oDGU3VsAJq4w==} @@ -13047,7 +13381,6 @@ packages: mkdirp-classic: 0.5.3 pump: 3.0.0 tar-stream: 3.1.6 - dev: true /tar-stream@2.2.0: resolution: {integrity: sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==} @@ -13059,7 +13392,6 @@ packages: fs-constants: 1.0.0 inherits: 2.0.4 readable-stream: 3.6.2 - dev: true /tar-stream@3.1.6: resolution: {integrity: sha512-B/UyjYwPpMBv+PaFSWAmtYjwdrlEaZQEhMIBFNC5oEG8lpiW8XjcSdmEaClj28ArfKScKHs2nshz3k2le6crsg==} @@ -13067,7 +13399,6 @@ packages: b4a: 1.6.4 fast-fifo: 1.3.2 streamx: 2.15.1 - dev: true /tar@6.1.13: resolution: {integrity: sha512-jdIBIN6LTIe2jqzay/2vtYLlBHa3JF42ot3h1dW8Q0PaAG4v8rm0cvpVePtau5C6OKXGGcgO9q2AMNSWxiLqKw==} @@ -13205,9 +13536,24 @@ packages: engines: {node: '>=6'} dev: false + /tough-cookie@4.1.4: + resolution: {integrity: sha512-Loo5UUvLD9ScZ6jh8beX1T6sO1w2/MpCRpEP7V280GKMVUQ0Jzar2U3UJPsrdbziLEMMhu3Ujnq//rhiFuIeag==} + engines: {node: '>=6'} + dependencies: + psl: 1.9.0 + punycode: 2.3.0 + universalify: 0.2.0 + url-parse: 1.5.10 + /tr46@0.0.3: resolution: {integrity: sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==} + /tr46@5.0.0: + resolution: {integrity: sha512-tk2G5R2KRwBd+ZN0zaEXpmzdKyOYksXwywulIX95MBODjSzMIuQnQ3m8JxgbhnL1LeVo7lqQKsYa1O3Htl7K5g==} + engines: {node: '>=18'} + dependencies: + punycode: 2.3.1 + /tree-kill@1.2.2: resolution: {integrity: sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A==} hasBin: true @@ -13291,8 +13637,6 @@ packages: requiresBuild: true dependencies: safe-buffer: 5.2.1 - dev: true - optional: true /tunnel@0.0.6: 
resolution: {integrity: sha512-1h/Lnq9yajKY2PEbBadPXj3VxsDDu844OnaAo52UVmIzIvwwtBPIuNvkjuzBlTWpfJyUbG3ez0KSBibQkj4ojg==} @@ -13500,6 +13844,10 @@ packages: unist-util-visit-parents: 6.0.1 dev: false + /universalify@0.2.0: + resolution: {integrity: sha512-CJ1QgKmNg3CwvAv/kOFmtnEN05f0D/cn9QntgNOQlQF9dgvVTHj3t+8JPdjqawCHk7V/KA+fbUqzZ9XWhcqPUg==} + engines: {node: '>= 4.0.0'} + /universalify@1.0.0: resolution: {integrity: sha512-rb6X1W158d7pRQBg5gkR8uPaSfiids68LTJQYOtEUhoJUWBdaQHsuT/EUduxXYxcrt4r5PJ4fuHW1MHT6p0qug==} engines: {node: '>= 10.0.0'} @@ -13559,6 +13907,12 @@ packages: resolution: {integrity: sha512-jk1+QP6ZJqyOiuEI9AEWQfju/nB2Pw466kbA0LEZljHwKeMgd9WrAEgEGxjPDD2+TNbbb37rTyhEfrCXfuKXnA==} dev: true + /url-parse@1.5.10: + resolution: {integrity: sha512-WypcfiRhfeUP9vvF0j6rw0J3hrWrw6iZv3+22h6iRMJ/8z1Tj6XfLP4DsUix5MhMPnXpiHDoKyoZ/bdCkwBCiQ==} + dependencies: + querystringify: 2.2.0 + requires-port: 1.0.0 + /url-template@2.0.8: resolution: {integrity: sha512-XdVKMF4SJ0nP/O7XIPB0JwAEuT9lDIYnNsK8yGVe43y0AWoKeJNdv3ZNWh7ksJ6KqQFjOO6ox/VEitLnaVNufw==} dev: false @@ -13808,12 +14162,12 @@ packages: vitest: '>=0.16.0' dependencies: cross-fetch: 3.1.8 - vitest: 1.6.0(@types/node@20.10.0) + vitest: 1.6.0(@types/node@20.10.0)(jsdom@23.2.0) transitivePeerDependencies: - encoding dev: true - /vitest@1.6.0(@types/node@20.10.0): + /vitest@1.6.0(@types/node@20.10.0)(jsdom@23.2.0): resolution: {integrity: sha512-H5r/dN06swuFnzNFhq/dnz37bPXnq8xB2xB5JOVk8K09rUtoeNN+LHWkoQ0A/i3hvbUKKcCei9KpbxqHMLhLLA==} engines: {node: ^18.0.0 || >=20.0.0} hasBin: true @@ -13848,6 +14202,7 @@ packages: chai: 4.3.10 debug: 4.3.4(supports-color@8.1.1) execa: 8.0.1 + jsdom: 23.2.0 local-pkg: 0.5.0 magic-string: 0.30.7 pathe: 1.1.2 @@ -13876,6 +14231,12 @@ packages: /w3c-keyname@2.2.8: resolution: {integrity: sha512-dpojBhNsCNN7T82Tm7k26A6G9ML3NkhDsnw9n/eoxSRlVBB4CEtIQ/KTCLI2Fwf3ataSXRhYFkQi3SlnFwPvPQ==} + /w3c-xmlserializer@5.0.0: + resolution: {integrity: 
sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==} + engines: {node: '>=18'} + dependencies: + xml-name-validator: 5.0.0 + /walker@1.0.8: resolution: {integrity: sha512-ts/8E8l5b7kY0vlWLewOkDXMmPdLcVV4GmOQLyxuSswIJsweeFZtAsMF7k1Nszz+TYBQrlYRmzOnr398y1JemQ==} dependencies: @@ -13903,6 +14264,10 @@ packages: /webidl-conversions@3.0.1: resolution: {integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==} + /webidl-conversions@7.0.0: + resolution: {integrity: sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==} + engines: {node: '>=12'} + /webpack-sources@3.2.3: resolution: {integrity: sha512-/DyMEOrDgLKKIG0fmvtz+4dUX/3Ghozwgm6iPp8KRhvn+eQf9+Q7GWxVNMk3+uCPWfdXYC4ExGBckIXdFEfH1w==} engines: {node: '>=10.13.0'} @@ -13912,6 +14277,23 @@ packages: resolution: {integrity: sha512-poXpCylU7ExuvZK8z+On3kX+S8o/2dQ/SVYueKA0D4WEMXROXgY8Ez50/bQEUmvoSMMrWcrJqCHuhAbsiwg7Dg==} dev: true + /whatwg-encoding@3.1.1: + resolution: {integrity: sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==} + engines: {node: '>=18'} + dependencies: + iconv-lite: 0.6.3 + + /whatwg-mimetype@4.0.0: + resolution: {integrity: sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==} + engines: {node: '>=18'} + + /whatwg-url@14.0.0: + resolution: {integrity: sha512-1lfMEm2IEr7RIV+f4lUNPOqfFL+pO+Xw3fJSqmjX9AbXcXcYOkCe1P6+9VBZB6n94af16NfZf+sSk0JCBZC9aw==} + engines: {node: '>=18'} + dependencies: + tr46: 5.0.0 + webidl-conversions: 7.0.0 + /whatwg-url@5.0.0: resolution: {integrity: sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==} dependencies: @@ -14064,6 +14446,22 @@ packages: optional: true dev: true + /ws@8.17.0: + resolution: {integrity: sha512-uJq6108EgZMAl20KagGkzCKfMEjxmKvZHG7Tlq0Z6nOky7YF7aq4mOx6xK8TJ/i1LeK4Qus7INktacctDgY8Ow==} + 
engines: {node: '>=10.0.0'} + peerDependencies: + bufferutil: ^4.0.1 + utf-8-validate: '>=5.0.2' + peerDependenciesMeta: + bufferutil: + optional: true + utf-8-validate: + optional: true + + /xml-name-validator@5.0.0: + resolution: {integrity: sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==} + engines: {node: '>=18'} + /xml2js@0.5.0: resolution: {integrity: sha512-drPFnkQJik/O+uPKpqSgr22mpuFHqKdbS835iAQrUC73L2F5WkboIRd63ai/2Yg6I1jzifPFKH2NTK+cfglkIA==} engines: {node: '>=4.0.0'} @@ -14077,6 +14475,9 @@ packages: engines: {node: '>=4.0'} dev: true + /xmlchars@2.2.0: + resolution: {integrity: sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==} + /xss@1.0.14: resolution: {integrity: sha512-og7TEJhXvn1a7kzZGQ7ETjdQVS2UfZyTlsEdDOqvQF7GoxNfY+0YLCzBy1kPdsDDx4QuNAonQPddpsn6Xl/7sw==} engines: {node: '>= 0.10.0'} @@ -14101,7 +14502,6 @@ packages: /yallist@4.0.0: resolution: {integrity: sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==} - dev: true /yaml@2.3.4: resolution: {integrity: sha512-8aAvwVUSHpfEqTQ4w/KMlf3HcRdt50E5ODIQJBw1fQ5RL34xabzxtUlzTXVqc4rkZsPbvrXKWnABCD7kWSmocA==} diff --git a/provider/docs/.gitignore b/provider/docs/.gitignore new file mode 100644 index 00000000..350414c9 --- /dev/null +++ b/provider/docs/.gitignore @@ -0,0 +1,2 @@ +# @xenova/transformers writes here when running the VS Code extension +/.cache/ diff --git a/provider/docs/README.md b/provider/docs/README.md new file mode 100644 index 00000000..9630818f --- /dev/null +++ b/provider/docs/README.md @@ -0,0 +1,75 @@ +# Docs context provider for OpenCtx + +This is a context provider for [OpenCtx](https://openctx.org) that adds contextual documentation to your code from an existing documentation corpus. + +## Screenshot + +![Screenshot of OpenCtx docs items]() + +_TODO(sqs)_ + +Visit the [OpenCtx playground](https://openctx.org/playground) for a live example.
+ +## Usage + +Add the following to your settings in any OpenCtx client: + +```json +"openctx.providers": { + // ...other providers... + "https://openctx.org/npm/@openctx/provider-docs": { + // TODO(sqs) + } +}, +``` + +TODO(sqs) + +See "[Configuration](#configuration)" for more. + +Tips: + +- If you're using VS Code, you can put the snippet above in `.vscode/settings.json` in the repository or workspace root to configure per-repository links. +- Play around with the docs provider in realtime on the [OpenCtx playground](https://openctx.org/playground). + +## Configuration + + + +```typescript +/** Settings for the docs OpenCtx provider. */ +export interface Settings { + // TODO(sqs) +} +``` + +## Design + +### Concepts + +- Corpus: a set of documents, such as a documentation site for a library. +- Archive: a raw dump of the contents of a corpus, such as the full HTML content of all web pages on a documentation site. +- Index: a file containing pre-computed embeddings and full-text search indexes for all documents in an archive. 
+ +## Indexing a documentation corpus + +- Create the corpus archive +- Index the corpus --> the index file is what's + +``` +pnpm -C provider/docs run -s create-archive web '{"entryPage": "https://vitejs.dev/guide", "prefix": "https://vitejs.dev/guide", "ignore":[]}' > ~/tmp/octx-provider-docs/vite-docs-web-corpus.json +pnpm -C provider/docs run -s create-index < ~/tmp/octx-provider-docs/vite-docs-web-corpus.json > ~/tmp/octx-provider-docs/vite-docs-web-index.json + +######### OLD below + +time p -C provider/docs run -s docs-query 'redirect' $(find ~/src/github.com/vikejs/vike/docs/pages -type f) +time p -C provider/docs run -s docs-query 'making provider work in vscode' $(find ../../web/content/docs -type f) +p -C provider/docs run -s create-web-corpus https://docs.sourcegraph.com https://docs.sourcegraph.com https://docs.sourcegraph.com/@ cli/references .json .svg CHANGELOG > ~/tmp/octx-provider-docs/sourcegraph-docs-old-web-corpus.json +``` + +## Development + +- [Source code](https://sourcegraph.com/github.com/sourcegraph/openctx/-/tree/provider/docs) +- [Docs](https://openctx.org/docs/providers/docs) +- [Roadmap](https://github.com/sourcegraph/openctx/issues/11) +- License: Apache 2.0 diff --git a/provider/docs/bin/create-archive.ts b/provider/docs/bin/create-archive.ts new file mode 100644 index 00000000..69fd2308 --- /dev/null +++ b/provider/docs/bin/create-archive.ts @@ -0,0 +1,95 @@ +import path from 'path' +import { readFile } from 'fs/promises' +import { type CorpusArchive, createCorpusArchive } from '../src/corpus/archive/corpusArchive.ts' +import { + type WebCorpusArchiveOptions, + createWebCorpusArchive, +} from '../src/corpus/archive/web/webCorpusArchive.ts' +import type { Doc } from '../src/corpus/doc/doc.ts' + +type ArchiveKind = 'web' | 'file' +const ARCHIVE_KINDS: Record< + ArchiveKind, + { + optionsHelp: string + toOptions?: (value: any) => unknown + createFn: (options: any) => Promise + } +> = { + web: { + optionsHelp: JSON.stringify({ + 
entryPage: new URL('https://docs.example.com'), + prefix: new URL('https://docs.example.com'), + ignore: ['.svg', '/old/'], + } as WebCorpusArchiveOptions), + toOptions: (value: any) => + ({ + ...value, + entryPage: new URL(value.entryPage), + prefix: new URL(value.prefix), + logger: message => console.error('# ' + message), + }) satisfies WebCorpusArchiveOptions, + createFn: createWebCorpusArchive, + }, + file: { + optionsHelp: JSON.stringify(['/path/to/file1.txt', '/path/to/file2.md']), + createFn: async (files: string[]) => + createCorpusArchive( + await Promise.all( + files.map(async (file, i) => { + const data = await readFile(file, 'utf8') + return { + id: i + 1, + text: data, + } satisfies Doc + }) + ) + ), + }, +} + +function usage(): void { + console.error() + console.error('Usage:') + console.error() + for (const kind of Object.keys(ARCHIVE_KINDS).toSorted() as ArchiveKind[]) { + console.error( + ` ${path.basename(process.argv[1])} ${kind} '${ARCHIVE_KINDS[kind].optionsHelp}'` + ) + } +} + +const args = process.argv.slice(2) +const kind = args.at(0) as ArchiveKind | undefined +const optionsText = args.at(1) +let optionsRaw: any +try { + optionsRaw = JSON.parse(optionsText ?? '') +} catch (error) { + console.error('Error parsing options JSON:', error) + usage() + process.exit(1) +} +if (!kind || !ARCHIVE_KINDS[kind]) { + console.error( + `Unrecognized archive kind: ${kind} (valid values are: ${Object.keys(ARCHIVE_KINDS).join(', ')})` + ) + usage() + process.exit(1) +} + +const archiveHandler = ARCHIVE_KINDS[kind] +const options = archiveHandler.toOptions ? 
archiveHandler.toOptions(optionsRaw) : optionsRaw +const t0 = performance.now() +const archive = await archiveHandler.createFn(options) +const data = JSON.stringify(archive) +console.error( + `# Archive complete [${Math.round(performance.now() - t0)}ms]: ${archive.docs.length} docs (${( + data.length / + 1024 / + 1024 + ).toFixed(1)} MB), content ID: ${archive.contentID}, description ${JSON.stringify( + archive.description + )}` +) +process.stdout.write(data) diff --git a/provider/docs/bin/create-index.ts b/provider/docs/bin/create-index.ts new file mode 100644 index 00000000..8af7b97f --- /dev/null +++ b/provider/docs/bin/create-index.ts @@ -0,0 +1,60 @@ +import path from 'path' +import type { CorpusArchive } from '../src/corpus/archive/corpusArchive.ts' +import { extractContentUsingMozillaReadability } from '../src/corpus/doc/contentExtractor.ts' +import { createCorpusIndex } from '../src/corpus/index/corpusIndex.ts' + +function usage(): void { + console.error() + console.error(`Usage: ${path.basename(process.argv[1])} < /path/to/archive.json`) + console.error() + console.error('Note: Use the `create-archive` script to create the archive.json file.') + process.exit(1) +} + +const args = process.argv.slice(2) +if (args.length !== 0) { + console.error('Error: invalid arguments') + usage() +} + +const archive = (await readJSONFromStdin()) as CorpusArchive +console.error( + `# Using archive: ${archive.docs.length} docs, content ID ${ + archive.contentID + }, description ${JSON.stringify(archive.description)}` +) + +const t0 = performance.now() +const index = await createCorpusIndex(archive, { + contentExtractor: extractContentUsingMozillaReadability, +}) +const data = JSON.stringify(index) +console.error( + `# Index complete [${Math.round(performance.now() - t0)}ms]: ${index.docs.length} docs (${( + data.length / + 1024 / + 1024 + ).toFixed(1)} MB)` +) +process.stdout.write(data) + +function readJSONFromStdin(): Promise { + return new Promise((resolve, reject) => { + 
const data: string[] = [] + process.stdin.on('data', chunk => { + data.push(chunk.toString('utf8')) + }) + process.stdin.once('end', () => { + try { + // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment + const json = JSON.parse(data.join('')) + resolve(json) + } catch (error) { + reject(error) + } + }) + process.stdin.once('error', error => { + reject(error) + }) + }) +} diff --git a/provider/docs/bin/search.ts b/provider/docs/bin/search.ts new file mode 100644 index 00000000..6bbff35f --- /dev/null +++ b/provider/docs/bin/search.ts @@ -0,0 +1,70 @@ +import path from 'path' +import { readFile } from 'fs/promises' +import { createClient } from '../src/client/client.ts' +import { fromJSON } from '../src/corpus/index/corpusIndex.ts' + +const args = process.argv.slice(2) + +const indexFile = args[0] +const query = args[1] + +const USAGE = `\nUsage: ${path.basename(process.argv[1])} ` +if (!indexFile) { + console.error('Error: no index file specified (use the `create-index` script to create one)') + console.error(USAGE) + process.exit(1) +} +if (!query) { + console.error('Error: no query specified') + console.error(USAGE) + process.exit(1) +} +if (args.length !== 2) { + console.error('Error: invalid arguments') + console.error(USAGE) + process.exit(1) +} + +const index = fromJSON(JSON.parse(await readFile(indexFile, 'utf8'))) + +const client = createClient(index, { logger: message => console.error('# ' + message) }) + +const results = await client.search({ text: query }) +const MAX_RESULTS = 5 +const SHOW_EXCERPT = false +const SHOW_SCORES = true +console.error( + `# ${results.length} results${results.length > MAX_RESULTS ? ` (showing top ${MAX_RESULTS})` : ''}` +) +for (const [i, result] of results.slice(0, MAX_RESULTS).entries()) { + const doc = client.doc(result.doc) + if (i !== 0) { + console.log() + } + console.log( + `#${i + 1} [${result.score.toFixed(3)}] ${doc.doc.url ?? 
''} doc${doc.doc.id}#chunk${ + result.chunk + }` + ) + const chunk = doc.chunks[result.chunk] + if (SHOW_EXCERPT) { + console.log(`${indent(truncate(chunk.text.replaceAll('\n\n', '\n'), 500), '\t')}`) + } + if (SHOW_SCORES) { + console.log(`\tscores: ${JSON.stringify(result.scores)}`) + } +} + +function truncate(text: string, maxLength: number): string { + if (text.length > maxLength) { + return text.slice(0, maxLength) + '...' + } + return text +} + +function indent(text: string, indent: string): string { + if (text === '') { + return '' + } + return indent + text.replaceAll('\n', '\n' + indent) +} diff --git a/provider/docs/package.json b/provider/docs/package.json new file mode 100644 index 00000000..dd988144 --- /dev/null +++ b/provider/docs/package.json @@ -0,0 +1,40 @@ +{ + "name": "@openctx/provider-docs", + "version": "0.0.1", + "description": "Context from any website (OpenCtx provider)", + "license": "Apache-2.0", + "repository": { + "type": "git", + "url": "https://github.com/sourcegraph/openctx", + "directory": "provider/docs" + }, + "type": "module", + "main": "dist/provider.mjs", + "types": "dist/src/provider/provider.d.ts", + "files": ["dist/provider.mjs", "dist/src/provider/provider.d.ts"], + "sideEffects": false, + "scripts": { + "bundle": "tsc --build && pnpm run -s bundle:esm && pnpm run -s bundle:cjs", + "bundle:esm": "esbuild src/provider/provider.ts --bundle --outfile=dist/provider.mjs --platform=node --format=esm --sourcemap --define:self=global --loader:.node=file --alias:sharp=/dev/null --define:import.meta.url=import_meta_url --inject:src/polyfill1.js", + "bundle:cjs": "esbuild src/provider/provider.ts --bundle --outfile=dist/provider.cjs --platform=node --format=cjs --sourcemap --define:self=global --loader:.node=file --alias:sharp=/dev/null --define:import.meta.url=import_meta_url --inject:src/polyfill1.js", + "prepublishOnly": "tsc --build --clean && npm run --silent bundle", + "test": "vitest", + "create-archive": "node 
--no-warnings=ExperimentalWarning --experimental-specifier-resolution=node --loader ts-node/esm/transpile-only bin/create-archive.ts", + "create-index": "node --no-warnings=ExperimentalWarning --experimental-specifier-resolution=node --loader ts-node/esm/transpile-only bin/create-index.ts", + "search": "node --no-warnings=ExperimentalWarning --experimental-specifier-resolution=node --loader ts-node/esm/transpile-only bin/search.ts" + }, + "dependencies": { + "@mozilla/readability": "^0.5.0", + "@openctx/provider": "workspace:*", + "@xenova/transformers": "^2.13.4", + "buffer": "^6.0.3", + "jsdom": "^23.2.0", + "lru-cache": "^10.1.0", + "onnxruntime-web": "1.16" + }, + "devDependencies": { + "@types/jsdom": "^21.1.6", + "esbuild": "^0.19.11", + "vitest-fetch-mock": "^0.2.2" + } +} diff --git a/provider/docs/src/client/client.ts b/provider/docs/src/client/client.ts new file mode 100644 index 00000000..86be9d58 --- /dev/null +++ b/provider/docs/src/client/client.ts @@ -0,0 +1,45 @@ +import type { DocID } from '../corpus/doc/doc.ts' +import type { CorpusIndex, IndexedDoc } from '../corpus/index/corpusIndex.ts' +import type { Logger } from '../logger.ts' +import type { Query, SearchResult } from '../search/types.ts' +import { search } from './search.ts' + +/** + * A client for searching a {@link CorpusIndex}. + */ +export interface Client { + /** Search the corpus. */ + search(query: Query): Promise + + /** Get a document by docID. An exception is thrown if no such document exists. */ + doc(id: DocID): IndexedDoc + + /** All documents. */ + docs: readonly IndexedDoc[] +} + +export interface ClientOptions { + /** + * Called to print log messages. + */ + logger?: Logger +} + +/** + * Create a client for searching a {@link CorpusIndex}. 
+ */ +export function createClient(index: CorpusIndex, options: ClientOptions = {}): Client { + return { + search(query) { + return search(index, query, { logger: options.logger }) + }, + doc(id) { + const doc = index.docs.find(d => d.doc.id === id) + if (!doc) { + throw new Error(`no document with id ${id} in corpus`) + } + return doc + }, + docs: index.docs, + } +} diff --git a/provider/docs/src/client/search.ts b/provider/docs/src/client/search.ts new file mode 100644 index 00000000..f65305e4 --- /dev/null +++ b/provider/docs/src/client/search.ts @@ -0,0 +1,76 @@ +import type { ChunkIndex } from '../corpus/doc/chunks.ts' +import type { DocID } from '../corpus/doc/doc.ts' +import type { CorpusIndex } from '../corpus/index/corpusIndex.ts' +import type { Logger } from '../logger.ts' +import { embeddingsSearch } from '../search/embeddings.ts' +import { keywordSearch } from '../search/keyword.ts' +import type { Query, SearchResult } from '../search/types.ts' + +export interface SearchOptions { + logger?: Logger +} + +/** + * Search using multiple search methods. + */ +export async function search( + index: CorpusIndex, + query: Query, + { logger }: SearchOptions +): Promise { + const allResults = await Promise.all( + Object.entries(SEARCH_METHODS).map(async ([name, searchFn]) => { + const t0 = performance.now() + const results = await searchFn(index, query) + logger?.(`search[${name}] took ${Math.round(performance.now() - t0)}ms`) + return [name, results] as [string, SearchResult[]] + }) + ) + + // Sum scores for each chunk. + const combinedResults = new Map>() + for (const [searchMethod, results] of allResults) { + for (const result of results) { + let docResults = combinedResults.get(result.doc) + if (!docResults) { + docResults = new Map() + combinedResults.set(result.doc, docResults) + } + + const chunkResult: SearchResult = docResults.get(result.chunk) ?? 
{ + doc: result.doc, + chunk: result.chunk, + score: 0, + scores: {}, + excerpt: result.excerpt, + } + + // HACK: TF-IDF scores are lower than embeddings scores, so boost. + const scoreBoostFactor = searchMethod === 'keywordSearch' ? 4 : 1 + const adjustedScore = result.score * scoreBoostFactor + + docResults.set(result.chunk, { + ...chunkResult, + score: chunkResult.score + adjustedScore, + scores: { ...chunkResult.scores, [searchMethod]: adjustedScore }, + }) + } + } + + const results = Array.from(combinedResults.values()).flatMap(docResults => + Array.from(docResults.values()) + ) + const MIN_SCORE = 0.001 + return results.filter(s => s.score >= MIN_SCORE).toSorted((a, b) => b.score - a.score) +} + +const SEARCH_METHODS: Record< + string, + ( + index: CorpusIndex, + query: Query + ) => Omit[] | Promise[]> +> = { + keywordSearch, + embeddingsSearch, +} diff --git a/provider/docs/src/corpus/archive/corpusArchive.ts b/provider/docs/src/corpus/archive/corpusArchive.ts new file mode 100644 index 00000000..72e24b7a --- /dev/null +++ b/provider/docs/src/corpus/archive/corpusArchive.ts @@ -0,0 +1,33 @@ +import { contentID } from '../cache/contentID.ts' +import type { Doc } from '../doc/doc.ts' + +export interface CorpusArchive { + /** + * A human-readable description of the corpus, often including the command used to create the + * corpus archive. + */ + description: string + + /** The contents of all documents in the corpus. */ + docs: Doc[] + + /** The SHA-256 hash of the content of the documents. */ + contentID: string +} + +export async function createCorpusArchive(docs: Doc[], description = ''): Promise { + const seenIDs = new Set() + for (const doc of docs) { + if (doc.text.length === 0) { + throw new Error(`empty doc: ${doc.id}${doc.url ? 
` (${doc.url})` : ''}`) + } + + if (seenIDs.has(doc.id)) { + throw new Error(`duplicate doc ID: ${doc.id}`) + } + seenIDs.add(doc.id) + } + + const fullContent = docs.map(doc => doc.text).join('\0') + return { description, docs, contentID: await contentID(fullContent) } +} diff --git a/provider/docs/src/corpus/archive/web/crawlQueue.ts b/provider/docs/src/corpus/archive/web/crawlQueue.ts new file mode 100644 index 00000000..5b65c55f --- /dev/null +++ b/provider/docs/src/corpus/archive/web/crawlQueue.ts @@ -0,0 +1,45 @@ +export interface CrawlQueue { + nextURL: () => URL | undefined + enqueueURL: (url: URL) => void + shouldCrawlURL: (url: URL) => boolean +} + +export function createCrawlQueue(isURLInScope: (url: URL) => boolean): CrawlQueue { + const seen = new Set() + const queue: URL[] = [] + + /** Normalize the URL to ignore the query string and hash. */ + function normalizeURLForSeen(url: URL): string { + return url.origin + url.pathname + } + + function seenURL(url: URL): boolean { + return seen.has(normalizeURLForSeen(url)) + } + function addToSeen(url: URL): void { + seen.add(normalizeURLForSeen(url)) + } + + const crawlQueue: CrawlQueue = { + nextURL() { + while (queue.length > 0) { + const url = queue.shift()! 
+ if (!seenURL(url)) { + addToSeen(url) + return url + } + } + return undefined + }, + enqueueURL(url) { + if (crawlQueue.shouldCrawlURL(url)) { + queue.push(url) + } + }, + shouldCrawlURL(url) { + return !seenURL(url) && isURLInScope(url) + }, + } + + return crawlQueue +} diff --git a/provider/docs/src/corpus/archive/web/webCorpusArchive.test.ts b/provider/docs/src/corpus/archive/web/webCorpusArchive.test.ts new file mode 100644 index 00000000..24f26d68 --- /dev/null +++ b/provider/docs/src/corpus/archive/web/webCorpusArchive.test.ts @@ -0,0 +1,128 @@ +import { afterAll, afterEach, beforeAll, describe, expect, test, vi } from 'vitest' +import createFetchMock from 'vitest-fetch-mock' +import type { Doc } from '../../doc/doc.ts' +import { createWebCorpusArchive, urlHasPrefix } from './webCorpusArchive.ts' + +describe('createWebCorpusSource', () => { + const fetchMocker = createFetchMock(vi) + beforeAll(() => fetchMocker.enableMocks()) + afterEach(() => fetchMocker.resetMocks()) + afterAll(() => fetchMocker.disableMocks()) + + test('crawls', async () => { + type MockResponseInit = Parameters[1] & { body: string } + const mockPages: { [pathname: string]: MockResponseInit } = { + '/docs': { + url: 'https://example.com/docs/entry', // redirected + body: 'Redirected', + }, + '/docs/entry': { + body: ` +

Docs

+

See foo and bar.

+

See also x. +Docs home +`, + }, + '/docs/foo': { + body: ` +

Foo

+

See foo/a.

+`, + }, + '/docs/foo/a': { + body: ` +

Foo/a

+

See foo and docs.

+`, + }, + '/docs/bar': { + body: ` +

Bar

+

See bar/a.

+`, + }, + '/docs/bar/a': { + status: 404, + body: 'Not Found', + }, + } + fetchMocker.mockResponse(req => { + const url = new URL(req.url) + if (url.protocol !== 'https:' || url.host !== 'example.com') { + throw new Error(`not mocked: ${req.url}`) + } + const resp = mockPages[url.pathname] + if (resp) { + return { url: url.toString(), ...resp } + } + throw new Error(`not mocked: ${req.url}`) + }) + + const archive = await createWebCorpusArchive({ + entryPage: new URL('https://example.com/docs/entry'), + prefix: new URL('https://example.com/docs'), + }) + expect(archive.docs).toEqual([ + { id: 1, text: mockPages['/docs/entry'].body, url: 'https://example.com/docs/entry' }, + { id: 2, text: mockPages['/docs/foo'].body, url: 'https://example.com/docs/foo' }, + { id: 3, text: mockPages['/docs/bar'].body, url: 'https://example.com/docs/bar' }, + { id: 4, text: mockPages['/docs/foo/a'].body, url: 'https://example.com/docs/foo/a' }, + ]) + }) + + test('respects canonical URLs', async () => { + fetchMocker.mockResponse( + ` + + + + + + no trailing slash + trailing slash + with querystring + + `, + { url: 'https://example.com/a/' } + ) + const archive = await createWebCorpusArchive({ + entryPage: new URL('https://example.com/a/'), + prefix: new URL('https://example.com'), + }) + expect(archive.docs).toMatchObject[]>([ + { id: 1, url: 'https://example.com/a' }, + ]) + }) +}) + +describe('urlHasPrefix', () => { + test('same url', () => + expect( + urlHasPrefix(new URL('https://example.com/a/b'), new URL('https://example.com/a/b')) + ).toBe(true)) + + test('path prefix', () => { + expect(urlHasPrefix(new URL('https://example.com/a/b'), new URL('https://example.com/a'))).toBe( + true + ) + expect(urlHasPrefix(new URL('https://example.com/a/b'), new URL('https://example.com/a/'))).toBe( + true + ) + expect(urlHasPrefix(new URL('https://example.com/a'), new URL('https://example.com/a/'))).toBe( + true + ) + expect(urlHasPrefix(new URL('https://example.com/a-b'), new 
URL('https://example.com/a'))).toBe( + false + ) + }) + + test('query', () => { + expect( + urlHasPrefix(new URL('https://example.com/a?page=b'), new URL('https://example.com/a')) + ).toBe(true) + expect( + urlHasPrefix(new URL('https://example.com/a/b?page=c'), new URL('https://example.com/a')) + ).toBe(true) + }) +}) diff --git a/provider/docs/src/corpus/archive/web/webCorpusArchive.ts b/provider/docs/src/corpus/archive/web/webCorpusArchive.ts new file mode 100644 index 00000000..f2e40c9f --- /dev/null +++ b/provider/docs/src/corpus/archive/web/webCorpusArchive.ts @@ -0,0 +1,140 @@ +import { parseDOM } from '../../../dom.ts' +import type { Logger } from '../../../logger.ts' +import type { Doc } from '../../doc/doc.ts' +import { type CorpusArchive, createCorpusArchive } from '../corpusArchive.ts' +import { createCrawlQueue } from './crawlQueue.ts' + +export interface WebCorpusArchiveOptions { + /** + * Start crawling from this page. + */ + entryPage: URL + + /** + * Only include pages whose URL starts with this prefix. + */ + prefix: URL + + /** + * Exclude pages whose URL contains any of these strings. + */ + ignore?: string[] + + /** + * Called to print log messages. 
+ */ + logger?: Logger +} + +export async function createWebCorpusArchive({ + entryPage, + prefix, + ignore, + logger, +}: WebCorpusArchiveOptions): Promise { + async function getDocs(): Promise { + const { nextURL, enqueueURL, shouldCrawlURL } = createCrawlQueue( + url => urlHasPrefix(url, prefix) && !ignore?.some(ignore => url.href.includes(ignore)) + ) + + if (!shouldCrawlURL(entryPage)) { + throw new Error(`web corpus entryPage (${entryPage}) does not start with prefix (${prefix})`) + } + + enqueueURL(entryPage) + + const documents: Doc[] = [] + + let url: URL | undefined + // biome-ignore lint/suspicious/noAssignInExpressions: + while ((url = nextURL())) { + logger?.(`Crawling URL: ${url.href}`) + + const resp = await fetch(url.href) + logger?.(`- Response: ${resp.status} ${resp.statusText}`) + if (!resp.ok) { + continue + } + + // Handle redirects. + if (resp.redirected || resp.url !== url.href) { + logger?.(`- Got redirect (redirected=${resp.redirected}, resp.url=${resp.url})`) + const wasRedirectedFromEntryPage = entryPage.href === url.href + url = new URL(resp.url) + if (!shouldCrawlURL(url) && !wasRedirectedFromEntryPage) { + logger?.(`- Skipping redirect destination URL: ${url}`) + continue + } + logger?.(`- Continuing with redirect destination URL: ${url}`) + } + + const html = await resp.text() + const dom = await parseDOM(html, resp.url) + + const canonicalURLStr = dom.querySelector( + "head > link[rel='canonical']" + )?.href + if (canonicalURLStr && canonicalURLStr !== url.href) { + const canonicalURL = parseURL(canonicalURLStr) + if (canonicalURL) { + // Only trust the canonical URL if it's same-origin, to avoid letting other + // sites pollute this corpus. 
+ if (canonicalURL.origin === url.origin) { + logger?.(`- Found canonical URL: ${canonicalURL}`) + url = canonicalURL + if (!shouldCrawlURL(url)) { + continue + } + } + } + } + + documents.push({ + id: documents.length + 1, + text: html, + url: url.toString(), + }) + + const pageLinks = dom.querySelectorAll('a[href]') + logger?.(`- Found ${pageLinks.length} links on page`) + for (const link of pageLinks) { + const linkURL = parseURL(link.href) + if (linkURL) { + enqueueURL(linkURL) + } + } + } + + return documents + } + + const docs = await getDocs() + return createCorpusArchive( + docs, + `createWebCorpusArchive with options ${JSON.stringify({ entryPage, prefix, ignore })}` + ) +} + +function parseURL(urlStr: string): URL | undefined { + try { + return new URL(urlStr) + } catch { + return undefined + } +} + +export function urlHasPrefix(url: URL, prefix: URL): boolean { + // Disallow username and password. + if (url.username || url.password) { + return false + } + return ( + url.protocol === prefix.protocol && + url.host === prefix.host && + (url.pathname === prefix.pathname || + url.pathname.startsWith( + prefix.pathname.endsWith('/') ? 
prefix.pathname : prefix.pathname + '/' + ) || + (prefix.pathname.endsWith('/') && url.pathname === prefix.pathname.slice(0, -1))) + ) +} diff --git a/provider/docs/src/corpus/cache/contentID.test.ts b/provider/docs/src/corpus/cache/contentID.test.ts new file mode 100644 index 00000000..56f913b7 --- /dev/null +++ b/provider/docs/src/corpus/cache/contentID.test.ts @@ -0,0 +1,10 @@ +import { describe, expect, test } from 'vitest' +import { contentID } from './contentID.ts' + +describe('contentID', () => { + test('returns the content ID', async () => { + expect(await contentID('abc')).toBe( + 'ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad' + ) + }) +}) diff --git a/provider/docs/src/corpus/cache/contentID.ts b/provider/docs/src/corpus/cache/contentID.ts new file mode 100644 index 00000000..b2a8fc3f --- /dev/null +++ b/provider/docs/src/corpus/cache/contentID.ts @@ -0,0 +1,19 @@ +import type { webcrypto } from 'crypto' + +/** + * A unique identifier for a document's or chunk's content (based on a hash of the text). 
+ */ +export type ContentID = string + +export async function contentID(text: string): Promise { + /// ///// console.count('contentID') + // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-member-access + const crypto: webcrypto.Crypto = + (globalThis as any).crypto || (await import('node:crypto')).default.webcrypto + + return Array.from( + new Uint8Array(await crypto.subtle.digest('SHA-256', new TextEncoder().encode(text))) + ) + .map(b => b.toString(16).padStart(2, '0')) + .join('') +} diff --git a/provider/docs/src/corpus/doc/chunks.test.ts b/provider/docs/src/corpus/doc/chunks.test.ts new file mode 100644 index 00000000..910f267a --- /dev/null +++ b/provider/docs/src/corpus/doc/chunks.test.ts @@ -0,0 +1,79 @@ +import { describe, expect, test } from 'vitest' +import { type Chunk, chunk } from './chunks.ts' + +describe('chunker', () => { + test('empty', () => expect(chunk('', {})).toEqual([])) + + test('fallback', () => + expect(chunk('a', {})).toEqual([{ range: { start: 0, end: 1 }, text: 'a' }])) + + describe('Default', () => { + test('single chunk', () => + expect(chunk('ab', {})).toEqual([ + { + range: { start: 0, end: 2 }, + text: 'ab', + }, + ])) + + test('multiple chunks', () => + expect(chunk('a\n\nb\n\n\tc\n\nd\n', { isTargetDoc: true })).toEqual([ + { + range: { start: 0, end: 1 }, + text: 'a', + }, + { + range: { start: 3, end: 8 }, + text: 'b\n\n\tc', + }, + { + range: { start: 10, end: 12 }, + text: 'd', + }, + ])) + }) + + describe('Markdown', () => { + test('by section', () => + expect( + chunk( + ` +# Title + +Intro + +## Section 1 + +Body 1 + +## Section 2 + +Body 2 +`.trim(), + { isMarkdown: true } + ) + ).toEqual([ + { + range: { + start: 2, + end: 16, + }, + text: 'Title\n\nIntro', + }, + { + range: { + start: 5, + end: 24, + }, + text: 'Section 1\n\nBody 1', + }, + { + range: { + start: 8, + end: 25, + }, + text: 'Section 2\n\nBody 2', + }, + ])) + }) +}) diff --git 
a/provider/docs/src/corpus/doc/chunks.ts b/provider/docs/src/corpus/doc/chunks.ts new file mode 100644 index 00000000..1fa75c26 --- /dev/null +++ b/provider/docs/src/corpus/doc/chunks.ts @@ -0,0 +1,94 @@ +/** + * Index of a {@link Chunk} in a {@link StoredDocument}. + */ +export type ChunkIndex = number + +export interface Chunk { + /** + * The text of the chunk, stripped of semantically meaningless markup, punctuation, and content. + * This text need not be present in the original document. + */ + text: string + + /** + * The range in the original document (as character offsets) represented by this chunk. + */ + range: { start: number; end: number } +} + +/** + * Information about the document to help the chunker know how to split the content into logical + * chunks. + */ +export interface ChunkerHints { + isMarkdown?: boolean + isTargetDoc?: boolean +} + +/** + * Split text into logical chunks (such as sections in a Markdown document). + */ +export function chunk(text: string, hints: ChunkerHints): Chunk[] { + if (hints.isMarkdown) { + return chunkMarkdown(text) + } + if (text.length === 0) { + return [] + } + if (hints.isTargetDoc) { + return chunkBySeparator(text, /(?:\r?\n){2,}(?=\S)/, text => text.trim()).filter( + chunk => !chunk.text.startsWith('import ') + ) + } + return [{ text, range: { start: 0, end: text.length } }] +} + +function chunkBySeparator( + text: string, + separator: RegExp, + transform?: (text: string) => string +): Chunk[] { + const chunks: Chunk[] = [] + const parts = text.split(new RegExp(`(${separator.source})`, separator.flags)) + let lastSep: string | undefined + for (const [i, part] of parts.entries()) { + const isSep = i % 2 === 1 + if (isSep) { + lastSep = part + } else { + const lastChunkEnd = chunks.at(-1)?.range.end ?? 0 + const lastSepLength = lastSep?.length ?? 0 + const text = (lastSep ?? '') + part + chunks.push({ + text: transform ? 
transform(text) : text, + range: { + start: lastChunkEnd + lastSepLength, + end: lastChunkEnd + lastSepLength + part.length, + }, + }) + } + } + return chunks +} + +function chunkMarkdown(text: string): Chunk[] { + const chunks: Chunk[] = [] + + const sections = text.split(/^(#+\s*)/m) + let pos = 0 + for (const section of sections) { + if (section.length === 0) { + continue + } + if (section.startsWith('#')) { + pos += section.length + continue + } + chunks.push({ + text: section.trim(), + range: { start: pos, end: pos + section.length }, + }) + } + + return chunks +} diff --git a/provider/docs/src/corpus/doc/contentExtractor.test.ts b/provider/docs/src/corpus/doc/contentExtractor.test.ts new file mode 100644 index 00000000..d7b9f825 --- /dev/null +++ b/provider/docs/src/corpus/doc/contentExtractor.test.ts @@ -0,0 +1,17 @@ +import { describe, expect, test } from 'vitest' +import { type Content, extractContentUsingMozillaReadability } from './contentExtractor.ts' + +describe('extractContentUsingMozillaReadability', () => { + test('extracts content', async () => + expect( + await extractContentUsingMozillaReadability.extractContent({ + id: 1, + text: 'Bar - MySite

Bar

\n

Baz

', + }) + ).toEqual({ + title: 'Bar - MySite', + content: + '

Bar

\n

Baz

', + textContent: 'Bar\nBaz', + })) +}) diff --git a/provider/docs/src/corpus/doc/contentExtractor.ts b/provider/docs/src/corpus/doc/contentExtractor.ts new file mode 100644 index 00000000..a857aae1 --- /dev/null +++ b/provider/docs/src/corpus/doc/contentExtractor.ts @@ -0,0 +1,49 @@ +import { Readability } from '@mozilla/readability' +import { parseDOM } from '../../dom.ts' +import type { Doc } from './doc.ts' + +export interface Content { + /** + * Title of the document. + */ + title: string + + /** + * Content of the document, including some markup. Omits non-content-related elements (header, + * footer, navigation, etc.). + */ + content: string + + /** + * Text content of the document, with all markup removed. Omits all non-content-related + * elements. + */ + textContent: string +} + +export interface ContentExtractor { + /** + * The ID of the content extractor is used as a cache key for its output. Change the ID to + * invalidate previously cached data when the chunker implementation changes significantly. + */ + id: string + + extractContent(doc: Doc): Promise +} + +export const extractContentUsingMozillaReadability: ContentExtractor = { + id: 'mozillaReadability', + async extractContent(doc) { + const dom = await parseDOM(doc.text, doc.url) + const info = new Readability(dom, { + charThreshold: 500, + }).parse() + return info + ? { + title: dom.title, + content: info.content, + textContent: info.textContent, + } + : null + }, +} diff --git a/provider/docs/src/corpus/doc/doc.ts b/provider/docs/src/corpus/doc/doc.ts new file mode 100644 index 00000000..a4a462a3 --- /dev/null +++ b/provider/docs/src/corpus/doc/doc.ts @@ -0,0 +1,14 @@ +/** + * A unique identifier for a document in a corpus. + */ +export type DocID = number + +/** + * A raw document in a corpus. 
+ */ +export interface Doc { + id: DocID + text: string + + url?: string +} diff --git a/provider/docs/src/corpus/index/corpusIndex.test.ts b/provider/docs/src/corpus/index/corpusIndex.test.ts new file mode 100644 index 00000000..d9c8fb9b --- /dev/null +++ b/provider/docs/src/corpus/index/corpusIndex.test.ts @@ -0,0 +1,24 @@ +import { describe, expect, test } from 'vitest' +import { createCorpusArchive } from '../archive/corpusArchive.ts' +import type { Doc, DocID } from '../doc/doc.ts' +import { createCorpusIndex, fromJSON } from './corpusIndex.ts' + +export function doc(id: DocID, text: string): Doc { + return { id, text } +} + +describe('indexCorpus', async () => { + const INDEX = await createCorpusIndex(await createCorpusArchive([doc(1, 'a'), doc(2, 'b')])) + + test('docs', () => { + expect(INDEX.docs.length).toBe(2) + }) + + test('JSON-serializable', () => { + const serialized = fromJSON(JSON.parse(JSON.stringify(INDEX))) + const indexWithoutToJSON = { ...INDEX } + ;(indexWithoutToJSON as any).toJSON = undefined + expect(serialized.docs[0].chunks[0].embeddings).toBeInstanceOf(Float32Array) + expect(serialized.docs).toEqual(INDEX.docs) + }) +}) diff --git a/provider/docs/src/corpus/index/corpusIndex.ts b/provider/docs/src/corpus/index/corpusIndex.ts new file mode 100644 index 00000000..ba47bb4f --- /dev/null +++ b/provider/docs/src/corpus/index/corpusIndex.ts @@ -0,0 +1,108 @@ +import { embedText } from '../../search/embeddings.ts' +import { type TFIDFIndex, createTFIDFIndex } from '../../search/tfidf.ts' +import type { CorpusArchive } from '../archive/corpusArchive.ts' +import { contentID } from '../cache/contentID.ts' +import { type Chunk, chunk } from '../doc/chunks.ts' +import type { Content, ContentExtractor } from '../doc/contentExtractor.ts' +import type { Doc } from '../doc/doc.ts' + +/** + * An index of a corpus. + */ +export interface CorpusIndex { + // Index data + docs: IndexedDoc[] + tfidf: TFIDFIndex +} + +/** + * An indexed document. 
+ */ +export interface IndexedDoc { + doc: Pick + content: Pick | null + + /** The SHA-256 hash of the indexed content (including chunks). */ + contentID: string + + chunks: (Chunk & { embeddings: Float32Array })[] +} + +/** + * Index a corpus. + */ +export async function createCorpusIndex( + archive: CorpusArchive, + { contentExtractor }: { contentExtractor?: ContentExtractor } = {} +): Promise { + const docs = await indexCorpusDocs(archive, { contentExtractor }) + const tfidf = createTFIDFIndex(docs) + const index: CorpusIndex = { + docs, + tfidf, + } + const serializable = { + ...index, + /** Handles serializing the Float32Array values. */ + toJSON: () => toJSON(index), + } + return serializable +} + +async function indexCorpusDocs( + corpus: CorpusArchive, + { contentExtractor }: { contentExtractor?: ContentExtractor } +): Promise { + return Promise.all( + corpus.docs.map(async doc => { + const content = contentExtractor ? await contentExtractor.extractContent(doc) : null + const chunks = chunk(content?.content ?? doc.text, { isMarkdown: doc.text.includes('##') }) + return { + doc: { id: doc.id, url: doc.url }, + content: + content?.title && content?.textContent + ? { title: content.title, textContent: content.textContent } + : null, + contentID: await contentID(JSON.stringify([doc, content, chunks])), + chunks: await Promise.all( + chunks.map(async chunk => ({ + ...chunk, + embeddings: await embedText(chunk.text), + })) + ), + } satisfies IndexedDoc + }) + ) +} + +function toJSON(index: CorpusIndex): any { + return { + ...index, + docs: index.docs.map(doc => ({ + ...doc, + chunks: doc.chunks.map(chunk => ({ + ...chunk, + embeddings: Array.from(chunk.embeddings), + })), + })), + } +} + +/** + * Must be called on any {@link CorpusIndex} value that was deserialized using `JSON.parse`. 
+ */ +export function fromJSON(indexData: any): CorpusIndex { + return { + ...indexData, + // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-call, @typescript-eslint/no-unsafe-member-access + docs: indexData.docs.map((doc: any) => ({ + ...doc, + // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-call, @typescript-eslint/no-unsafe-member-access + chunks: doc.chunks.map((chunk: any) => ({ + ...chunk, + // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access + embeddings: new Float32Array(chunk.embeddings), + })), + })), + } +} diff --git a/provider/docs/src/dom.ts b/provider/docs/src/dom.ts new file mode 100644 index 00000000..95efcebb --- /dev/null +++ b/provider/docs/src/dom.ts @@ -0,0 +1,23 @@ +export type ParseDOM = (html: string, url: string | undefined) => Promise + +/** + * Parse DOM (works in both Node and browser). + */ +export const parseDOM: ParseDOM = + typeof DOMParser === 'undefined' + ? async (html, url) => { + const { JSDOM } = await import('jsdom') + return new JSDOM(html, { url }).window.document + } + : (html, url) => { + const document = new DOMParser().parseFromString(html, 'text/html') + + // Set base URL. 
+ if (url && document.head.querySelectorAll('base').length === 0) { + const baseEl = document.createElement('base') + baseEl.setAttribute('href', url) + document.head.append(baseEl) + } + + return Promise.resolve(document) + } diff --git a/provider/docs/src/e2e.test.ts b/provider/docs/src/e2e.test.ts new file mode 100644 index 00000000..e48d8828 --- /dev/null +++ b/provider/docs/src/e2e.test.ts @@ -0,0 +1,40 @@ +import fs from 'node:fs/promises' +import path from 'node:path' +import { describe, expect, test } from 'vitest' +import { createClient } from './client/client.ts' +import { createCorpusArchive } from './corpus/archive/corpusArchive.ts' +import { createCorpusIndex } from './corpus/index/corpusIndex.ts' +import type { SearchResult } from './search/types.ts' + +describe('e2e', () => { + test('urlParsing', async () => { + const docFile = await fs.readFile(path.join(__dirname, 'testdata/corpus/urlParsing.md'), 'utf8') + const codeFile = await fs.readFile(path.join(__dirname, 'testdata/code/urlParsing.ts'), 'utf8') + + const index = await createCorpusIndex(await createCorpusArchive([{ id: 1, text: docFile }])) + const client = createClient(index) + const results = await client.search({ text: codeFile }) + roundScores(results) + expect(results.slice(0, 1)).toEqual([ + { + doc: 1, + chunk: 3, + excerpt: 'Audio URL parsing\n\nTo parse an audio URL, use the `parseAudioURL` function.', + score: 1.069, + scores: { + embeddingsSearch: 0.662, + keywordSearch: 0.407, + }, + }, + ]) + }) +}) + +function roundScores(results: SearchResult[]) { + for (const result of results) { + result.score = Math.round(result.score * 1000) / 1000 + for (const [searchMethod, score] of Object.entries(result.scores)) { + result.scores[searchMethod] = Math.round(score * 1000) / 1000 + } + } +} diff --git a/provider/docs/src/env.ts b/provider/docs/src/env.ts new file mode 100644 index 00000000..e03eadbe --- /dev/null +++ b/provider/docs/src/env.ts @@ -0,0 +1,3 @@ +export const 
isWebWindowRuntime = typeof window !== 'undefined' + +export const useWebWorker = isWebWindowRuntime diff --git a/provider/docs/src/logger.ts b/provider/docs/src/logger.ts new file mode 100644 index 00000000..a42e4737 --- /dev/null +++ b/provider/docs/src/logger.ts @@ -0,0 +1 @@ +export type Logger = (message: string) => void diff --git a/provider/docs/src/polyfill1.js b/provider/docs/src/polyfill1.js new file mode 100644 index 00000000..7ab7bf68 --- /dev/null +++ b/provider/docs/src/polyfill1.js @@ -0,0 +1,6 @@ +export const import_meta_url = + typeof document === 'undefined' + ? new (require('url'.replace('', '')).URL)('file:' + __filename).href + : // biome-ignore lint/complexity/useOptionalChain: + (document.currentScript && document.currentScript.src) || + new URL('main.js', document.baseURI).href diff --git a/provider/docs/src/provider/multiplex.ts b/provider/docs/src/provider/multiplex.ts new file mode 100644 index 00000000..90555976 --- /dev/null +++ b/provider/docs/src/provider/multiplex.ts @@ -0,0 +1,28 @@ +import type { Provider } from '@openctx/provider' +import { LRUCache } from 'lru-cache' + +/** + * @template S The settings type. + */ +export function multiplex( + createProvider: (settings: S) => Promise> +): Provider { + const providerCache = new LRUCache>>({ max: 10 }) + + function getProvider(settings: S): Promise> { + const key = JSON.stringify(settings) + let provider = providerCache.get(key) + if (!provider) { + provider = createProvider(settings) + providerCache.set(key, provider) + } + return provider + } + + return { + meta: (params, settings) => getProvider(settings).then(p => p.meta(params, settings)), + items: (params, settings) => getProvider(settings).then(p => p.items?.(params, settings) ?? []), + annotations: (params, settings) => + getProvider(settings).then(p => p.annotations?.(params, settings) ?? 
[]), + } +} diff --git a/provider/docs/src/provider/provider.ts b/provider/docs/src/provider/provider.ts new file mode 100644 index 00000000..e4d2bed6 --- /dev/null +++ b/provider/docs/src/provider/provider.ts @@ -0,0 +1,146 @@ +import type { + ItemsParams, + ItemsResult, + MentionsParams, + MentionsResult, + MetaResult, +} from '@openctx/provider' +import { createClient } from '../client/client.ts' +import type { DocID } from '../corpus/doc/doc.ts' +import { type CorpusIndex, type IndexedDoc, fromJSON } from '../corpus/index/corpusIndex.ts' +import type { SearchResult } from '../search/types.ts' +import { multiplex } from './multiplex.ts' + +/** Settings for the docs OpenCtx provider. */ +export interface Settings { + index: string +} + +/** + * An [OpenCtx](https://openctx.org) provider that adds contextual documentation to your + * code from an existing documentation corpus. + */ +export default multiplex(async settings => { + const index = await fetchIndex(settings.index) + const client = createClient(index, { logger: console.debug }) + + return { + meta(): MetaResult { + return { + name: 'docs', + features: { mentions: true }, + } + }, + + async mentions(params: MentionsParams): Promise { + const query = params.query?.trim() + const results = query ? await client.search({ text: query }) : client.docs + + const mentions: MentionsResult = [] + const seenDocIDs = new Set() + for (const result of results) { + const doc = isSearchResult(result) ? client.doc(result.doc) : result + if (seenDocIDs.has(doc.doc.id)) { + continue + } + seenDocIDs.add(doc.doc.id) + + const uri = doc.doc?.url + if (uri) { + mentions.push({ + title: doc.content?.title || doc.doc?.url || 'Untitled', + uri, + data: { textContent: doc.content?.textContent }, + }) + } + } + + if (mentions.length >= 2) { + // Trim common suffix (which is often the name of the doc site, like " - My Doc + // Site"). 
+ const suffix = longestCommonSuffix(mentions.map(r => r.title)) + if (suffix) { + for (const r of mentions) { + // Don't trim suffix if it would result in an empty or very short string. + if (r.title.length >= suffix.length + 10) { + r.title = r.title.slice(0, -1 * suffix.length) + } + } + } + } + + // Truncate titles. Do this after trimming common suffixes, or else no common suffix + // will be found if any titles were truncated. + for (const r of mentions) { + r.title = truncate(r.title, 50) + } + + return mentions + }, + + async items(params: ItemsParams): Promise { + if (params.mention) { + return [ + { + title: params.mention.title, + url: params.mention.uri, + ai: { content: params.mention.data?.textContent as string | undefined }, + }, + ] + } + return [] + }, + } +}) + +function isSearchResult(value: SearchResult | IndexedDoc): value is SearchResult { + return 'doc' in value && typeof value.doc === 'number' +} + +async function fetchIndex(urlStr: string): Promise { + const url = new URL(urlStr) + if (url.protocol === 'file:') { + const { readFile } = await import('node:fs/promises') + return fromJSON(JSON.parse(await readFile(url.pathname, 'utf-8'))) + } + + const resp = await fetch(urlStr) + if (!resp.ok) { + throw new Error(`Failed to fetch corpus index from ${urlStr} with HTTP status ${resp.status}`) + } + return fromJSON(await resp.json()) +} + +function longestCommonSuffix(texts: string[]): string { + if (texts.length === 0) { + return '' + } + if (texts.length === 1) { + return texts[0] + } + + const minLen = Math.min(...texts.map(text => text.length)) + let suffix = '' + + for (let i = 0; i < minLen; i++) { + // Get the current character from the end of the first string. + const currentChar = texts[0][texts[0].length - 1 - i] + + // Check if this character is present at the same position from the end in all strings. + if (texts.every(text => text[text.length - 1 - i] === currentChar)) { + // If so, prepend it to the result. 
+ suffix = currentChar + suffix + } else { + break + } + } + + return suffix +} + +function truncate(text: string, maxLength: number): string { + if (text.length > maxLength) { + return text.slice(0, maxLength) + '...' + } + return text +} diff --git a/provider/docs/src/search/embeddings.test.ts b/provider/docs/src/search/embeddings.test.ts new file mode 100644 index 00000000..825b0dd1 --- /dev/null +++ b/provider/docs/src/search/embeddings.test.ts @@ -0,0 +1,35 @@ +import { describe, expect, test } from 'vitest' +import { createCorpusArchive } from '../corpus/archive/corpusArchive.ts' +import { doc } from '../corpus/index/corpusIndex.test.ts' +import { createCorpusIndex } from '../corpus/index/corpusIndex.ts' +import { embedTextInThisScope, embeddingsSearch, similarity } from './embeddings.ts' + +describe('embeddingsSearch', () => { + test('finds matches', async () => { + expect( + await embeddingsSearch( + await createCorpusIndex(await createCorpusArchive([doc(1, 'xxxxxx'), doc(2, 'b')])), + { + text: 'b', + } + ) + ).toEqual([ + { doc: 2, chunk: 0, score: 1, excerpt: 'b' }, + { doc: 1, chunk: 0, score: 0.23823869524750682, excerpt: 'xxxxxx' }, + ]) + }) +}) + +describe('embedText', () => { + test('embeds', async () => { + const s = await embedTextInThisScope('hello world') + expect(s).toBeInstanceOf(Float32Array) + }) +}) + +describe('similarity', () => { + test('works', async () => { + expect(await similarity('what is the current time', 'what time is it')).toBeCloseTo(0.7217, 4) + expect(await similarity('hello world', 'seafood')).toBeCloseTo(0.2025, 4) + }) +}) diff --git a/provider/docs/src/search/embeddings.ts b/provider/docs/src/search/embeddings.ts new file mode 100644 index 00000000..113549bb --- /dev/null +++ b/provider/docs/src/search/embeddings.ts @@ -0,0 +1,128 @@ +import { cos_sim, dot, env, magnitude, pipeline } from '@xenova/transformers' +import * as onnxWeb from 'onnxruntime-web' +import type { CorpusIndex } from '../corpus/index/corpusIndex.ts' 
+import { isWebWindowRuntime, useWebWorker } from '../env.ts' +import type { Logger } from '../logger.ts' +import { embedTextOnWorker } from '../worker/webWorkerClient.ts' +import { withoutCodeStopwords } from './terms.ts' +import type { Query, SearchResult } from './types.ts' + +if (typeof process !== 'undefined' && process.env.VITEST) { + // Workaround (from + // https://github.com/microsoft/onnxruntime/issues/16622#issuecomment-1626413333) for when + // Vitest is running tests using the vmThreads pool. + const origIsArray = Array.isArray + Array.isArray = (arg): arg is any[] => { + // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access + if (arg?.constructor?.name === 'Float32Array' || arg?.constructor?.name === 'BigInt64Array') { + return true + } + return origIsArray(arg) + } +} + +// TODO(sqs): think we can remove this entirely... +// +// eslint-disable-next-line @typescript-eslint/prefer-optional-chain +if (typeof process !== 'undefined' && process.env.FORCE_WASM) { + // Force use of wasm backend for parity between Node.js and web. + // + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore + env.onnx = onnxWeb.env + // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access + ;(env as any).onnx.wasm.numThreads = 1 +} + +if (isWebWindowRuntime) { + // Running on Web. 
+ // + // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access + env.backends.onnx.wasm.wasmPaths = import.meta.resolve( + '../../node_modules/@xenova/transformers/dist/' + ) +} else if (typeof __dirname !== 'undefined') { + // TODO(sqs): seems to be triggered when running in vscode + env.backends.onnx.wasm.wasmPaths = __dirname + '/../node_modules/@xenova/transformers/dist/' + env.backends.onnx.wasm.numThreads = 1 +} + +env.allowLocalModels = false + +export async function embeddingsSearch( + index: CorpusIndex, + query: Query +): Promise[]> { + const textToEmbed = [query.meta?.activeFilename && `// ${query.meta?.activeFilename}`, query.text] + .filter((s): s is string => Boolean(s)) + .join('\n') + const queryVec = await embedText(withoutCodeStopwords(textToEmbed)) + const cosSim = cosSimWith(queryVec) + + const MIN_SCORE = 0.1 + + const results: SearchResult[] = index.docs + .flatMap(({ doc: { id: docID }, chunks }) => + chunks.map((chunk, i) => { + const score = cosSim(chunk.embeddings) + return score >= MIN_SCORE + ? ({ doc: docID, chunk: i, score, excerpt: chunk.text } satisfies Omit< + SearchResult, + 'scores' + >) + : null + }) + ) + .filter((r): r is SearchResult => r !== null) + .toSorted((a, b) => b.score - a.score) + + return results +} + +const pipe = pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {}) + +/** + * Embed the text and return the vector. Run in a worker in some environments. + */ +export const embedText = useWebWorker ? embedTextOnWorker : embedTextInThisScope + +/** + * Embed the text and return the vector. + * + * Run in the current scope (instead of in a worker in some environments). 
+ */ +export async function embedTextInThisScope(text: string, logger?: Logger): Promise { + try { + const t0 = performance.now() + + const out = await (await pipe)(text, { pooling: 'mean', normalize: true }) + logger?.(`embedText (${text.length} chars) took ${Math.round(performance.now() - t0)}ms`) + + return out.data as Float32Array // TODO(sqs): cast + } catch (error) { + console.log(error) + throw error + } +} + +function cosSimWith(a: Float32Array): (b: Float32Array) => number { + const mA = magnitude(a) + return b => dot(a, b) / (mA * magnitude(b)) +} + +/** + * Compute the cosine similarity of the two texts' embeddings vectors. + */ +export async function similarity(text1: string, text2: string): Promise { + const emb1 = await embedTextInThisScope(text1) + const emb2 = await embedTextInThisScope(text2) + return cos_sim(emb1, emb2) +} + +declare module '@xenova/transformers' { + // These functions are declared in the @xenova/transformers module as only accepting + // number[], but they accept Float32Array as well. 
+ export function cos_sim(a: Float32Array, b: Float32Array): number + export function dot(a: Float32Array, b: Float32Array): number + export function magnitude(arr: Float32Array): number +} diff --git a/provider/docs/src/search/keyword.test.ts b/provider/docs/src/search/keyword.test.ts new file mode 100644 index 00000000..85191480 --- /dev/null +++ b/provider/docs/src/search/keyword.test.ts @@ -0,0 +1,32 @@ +import { describe, expect, test } from 'vitest' +import { createCorpusArchive } from '../corpus/archive/corpusArchive.ts' +import { doc } from '../corpus/index/corpusIndex.test.ts' +import { createCorpusIndex } from '../corpus/index/corpusIndex.ts' +import { keywordSearch } from './keyword.ts' +import { calculateTFIDF } from './tfidf.ts' +import type { SearchResult } from './types.ts' + +describe('keywordSearch', () => { + test('finds matches', async () => { + expect( + keywordSearch( + await createCorpusIndex(await createCorpusArchive([doc(1, 'aaa'), doc(2, 'bbb')])), + { + text: 'bbb', + } + ) + ).toEqual[]>([ + { + doc: 2, + chunk: 0, + score: calculateTFIDF({ + termOccurrencesInChunk: 1, + chunkTermLength: 1, + totalChunks: 2, + termChunkFrequency: 1, + }), + excerpt: 'bbb', + }, + ]) + }) +}) diff --git a/provider/docs/src/search/keyword.ts b/provider/docs/src/search/keyword.ts new file mode 100644 index 00000000..72035aa3 --- /dev/null +++ b/provider/docs/src/search/keyword.ts @@ -0,0 +1,25 @@ +import type { CorpusIndex } from '../corpus/index/corpusIndex.ts' +import { terms } from './terms.ts' +import { computeTFIDF } from './tfidf.ts' +import type { Query, SearchResult } from './types.ts' + +export function keywordSearch(index: CorpusIndex, query: Query): Omit[] { + const queryTerms = terms(query.text).filter(term => term.length >= 3) + + const results: Omit[] = [] + for (const { + doc: { id: docID }, + chunks, + } of index.docs) { + for (const [i, chunk] of chunks.entries()) { + const score = queryTerms.reduce( + (score, term) => score + 
computeTFIDF(term, docID, i, index.tfidf), + 0 + ) + if (score > 0) { + results.push({ doc: docID, chunk: i, score, excerpt: chunk.text }) + } + } + } + return results +} diff --git a/provider/docs/src/search/terms.test.ts b/provider/docs/src/search/terms.test.ts new file mode 100644 index 00000000..71a1bc50 --- /dev/null +++ b/provider/docs/src/search/terms.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, test } from 'vitest' +import { terms } from './terms.ts' + +describe('terms', () => { + test('splits, stems, normalizes', () => { + expect(terms('my apples are cooler when = stored, oh my')).toEqual([ + 'apple', + 'cool', + 'stor', + 'oh', + ]) + }) +}) diff --git a/provider/docs/src/search/terms.ts b/provider/docs/src/search/terms.ts new file mode 100644 index 00000000..540b2a46 --- /dev/null +++ b/provider/docs/src/search/terms.ts @@ -0,0 +1,181 @@ +export type Term = string + +/** + * All terms in the text, with normalization and stemming applied. + */ +export function terms(text: string): Term[] { + return ( + tokens(text.toLowerCase()) + .filter(term => !stopwords.has(term)) + // TODO(sqs): get a real stemmer + .map(term => term.replace(/(.*)(?:es|ed|ing|s|er)$/, '$1')) + ) +} + +function tokens(text: string): string[] { + return text.split(/[^\w-]+/) +} + +export function withoutCodeStopwords(text: string): string { + return tokens(text) + .filter(term => !CODE_STOPWORDS.includes(term)) + .join(' ') +} + +const CODE_STOPWORDS = [ + 'class', + 'type', + 'interface', + 'extends', + 'implements', + 'props', + 'package', + 'function', + 'export', + 'import', + 'const', + 'let', + 'var', + 'for', + 'while', + 'if', + 'else', + 'then', + 'func', + 'declare', + 'return', + 'null', + 'undefined', + 'from', +] + +const stopwords = new Set([ + 'i', + 'me', + 'my', + 'myself', + 'we', + 'our', + 'ours', + 'ourselves', + 'you', + 'your', + 'yours', + 'yourself', + 'yourselves', + 'he', + 'him', + 'his', + 'himself', + 'she', + 'her', + 'hers', + 'herself', + 'it', 
+ 'its', + 'itself', + 'they', + 'them', + 'their', + 'theirs', + 'themselves', + 'what', + 'which', + 'who', + 'whom', + 'this', + 'that', + 'these', + 'those', + 'am', + 'is', + 'are', + 'was', + 'were', + 'be', + 'been', + 'being', + 'have', + 'has', + 'had', + 'having', + 'do', + 'does', + 'did', + 'doing', + 'a', + 'an', + 'the', + 'and', + 'but', + 'if', + 'or', + 'because', + 'as', + 'until', + 'while', + 'of', + 'at', + 'by', + 'for', + 'with', + 'about', + 'against', + 'between', + 'into', + 'through', + 'during', + 'before', + 'after', + 'above', + 'below', + 'to', + 'from', + 'up', + 'down', + 'in', + 'out', + 'on', + 'off', + 'over', + 'under', + 'again', + 'further', + 'then', + 'once', + 'here', + 'there', + 'when', + 'where', + 'why', + 'how', + 'all', + 'any', + 'both', + 'each', + 'few', + 'more', + 'most', + 'other', + 'some', + 'such', + 'no', + 'nor', + 'not', + 'only', + 'own', + 'same', + 'so', + 'than', + 'too', + 'very', + 's', + 't', + 'can', + 'will', + 'just', + 'don', + 'should', + 'now', + ...CODE_STOPWORDS, +]) diff --git a/provider/docs/src/search/tfidf.test.ts b/provider/docs/src/search/tfidf.test.ts new file mode 100644 index 00000000..77846164 --- /dev/null +++ b/provider/docs/src/search/tfidf.test.ts @@ -0,0 +1,55 @@ +import { describe, expect, test } from 'vitest' +import { createCorpusArchive } from '../corpus/archive/corpusArchive.ts' +import { createCorpusIndex } from '../corpus/index/corpusIndex.ts' +import { calculateTFIDF, computeTFIDF, createTFIDFIndex } from './tfidf.ts' + +describe('createTFIDFIndex', async () => { + const data = await createCorpusArchive([ + { id: 1, text: 'aa b c c c' }, + { id: 2, text: 'b c d' }, + { id: 3, text: 'c d e' }, + ]) + const docIDs = data.docs.map(({ id }) => id) + const index = await createCorpusIndex(data) + const tfidfIndex = createTFIDFIndex(index.docs) + + test('term in 1 doc', () => { + expect(docIDs.map(docID => computeTFIDF('aa', docID, 0, tfidfIndex))).toEqual([ + 
calculateTFIDF({ + termOccurrencesInChunk: 1, + chunkTermLength: 5, + totalChunks: 3, + termChunkFrequency: 1, + }), + 0, + 0, + ]) + }) + + test('term in all docs', () => { + expect(docIDs.map(docID => computeTFIDF('c', docID, 0, tfidfIndex))).toEqual([ + calculateTFIDF({ + termOccurrencesInChunk: 3, + chunkTermLength: 5, + totalChunks: 3, + termChunkFrequency: 3, + }), + calculateTFIDF({ + termOccurrencesInChunk: 1, + chunkTermLength: 3, + totalChunks: 3, + termChunkFrequency: 3, + }), + calculateTFIDF({ + termOccurrencesInChunk: 1, + chunkTermLength: 3, + totalChunks: 3, + termChunkFrequency: 3, + }), + ]) + }) + + test('unknown term', () => { + expect(docIDs.map(docID => computeTFIDF('x', docID, 0, tfidfIndex))).toEqual([0, 0, 0]) + }) +}) diff --git a/provider/docs/src/search/tfidf.ts b/provider/docs/src/search/tfidf.ts new file mode 100644 index 00000000..9d233e1d --- /dev/null +++ b/provider/docs/src/search/tfidf.ts @@ -0,0 +1,141 @@ +import type { ChunkIndex } from '../corpus/doc/chunks.ts' +import type { DocID } from '../corpus/doc/doc.ts' +import type { IndexedDoc } from '../corpus/index/corpusIndex.ts' +import { type Term, terms } from './terms.ts' + +/** + * Index the corpus for fast computation of TF-IDF. + * + * TF-IDF is a way of measuring the relevance of a term to a document in a corpus. See + * https://en.wikipedia.org/wiki/Tf%E2%80%93idf. + * + * TF-IDF = TF * IDF + * - TF = number of occurrences of term in the chunk / number of (non-unique) terms in the chunk + * - IDF = log(number of chunks / number of chunks containing the term) + */ +export function createTFIDFIndex(docs: IndexedDoc[]): TFIDFIndex { + /** + * DocID -> chunk index -> term -> number of occurrences of term in the chunk. + * + * "TF" in "TF-IDF" (with chunks instead of documents as the unit of analysis). + */ + const termFrequency: Record[][] = [] + + /** + * DocID -> chunk index -> number of (non-unique) terms in the chunk. 
+ */ + const termLength: number[][] = [] + + /** + * Term -> number of chunks containing the term. + * + * "DF" in "IDF" in "TF-IDF" (with chunks instead of documents as the unit of analysis). + */ + const chunkFrequency: Record = {} + + let totalChunks = 0 + + for (const { + doc: { id: docID }, + chunks, + } of docs) { + const docTermFrequency: Record[] = new Array>(chunks.length) + termFrequency[docID] = docTermFrequency + + const docTermLength: number[] = new Array(chunks.length) + termLength[docID] = docTermLength + + for (const [i, chunk] of chunks.entries()) { + const chunkTerms = terms(chunk.text) + + // Set chunk frequencies. + for (const uniqueTerm of new Set(chunkTerms).values()) { + chunkFrequency[uniqueTerm] = (chunkFrequency[uniqueTerm] ?? 0) + 1 + } + + // Set term frequencies. + const chunkTermFrequency: Record = {} + docTermFrequency[i] = chunkTermFrequency + for (const term of chunkTerms) { + chunkTermFrequency[term] = (chunkTermFrequency[term] ?? 0) + 1 + } + + // Set term cardinality. + docTermLength[i] = chunkTerms.length + + // Increment total chunks. + totalChunks++ + } + } + + return { + termFrequency, + termLength, + totalChunks, + chunkFrequency, + } +} + +/** + * An index that can be used to compute TF-IDF for a term. Create the index with + * {@link createTFIDFIndex}. + */ +export interface TFIDFIndex { + termFrequency: Record[][] + termLength: number[][] + totalChunks: number + chunkFrequency: Record +} + +/** + * Compute the TF-IDF for a term in a document chunk using an index created by + * {@link createTFIDFIndex}. 
+ */ +export function computeTFIDF(term: Term, doc: DocID, chunk: ChunkIndex, index: TFIDFIndex): number { + const docTermLength = index.termLength[doc] + if (!docTermLength) { + throw new Error(`doc ${doc} not found in termLength`) + } + if (typeof docTermLength[chunk] !== 'number') { + throw new TypeError(`chunk ${chunk} not found in termLength for doc ${doc}`) + } + + const docTermFrequency = index.termFrequency[doc] + if (!docTermFrequency) { + throw new Error(`doc ${doc} not found in termFrequency`) + } + if (typeof docTermFrequency[chunk] !== 'object') { + throw new TypeError(`chunk ${chunk} not found in termFrequency for doc ${doc}`) + } + + return calculateTFIDF({ + termOccurrencesInChunk: docTermFrequency[chunk][term] ?? 0, + chunkTermLength: docTermLength[chunk], + totalChunks: index.totalChunks, + termChunkFrequency: index.chunkFrequency[term] ?? 0, + }) +} + +export type TFIDF = (term: Term, doc: DocID, chunk: ChunkIndex) => number + +/** + * Calculate TF-IDF given the formula inputs. @see {createTFIDFIndex} + * + * Use {@link createTFIDFIndex} instead of calling this directly. + */ +export function calculateTFIDF({ + termOccurrencesInChunk, + chunkTermLength, + totalChunks, + termChunkFrequency, +}: { + termOccurrencesInChunk: number + chunkTermLength: number + totalChunks: number + termChunkFrequency: number +}): number { + return ( + (termOccurrencesInChunk / chunkTermLength) * + Math.log((1 + totalChunks) / (1 + termChunkFrequency)) + ) +} diff --git a/provider/docs/src/search/types.ts b/provider/docs/src/search/types.ts new file mode 100644 index 00000000..536c25c6 --- /dev/null +++ b/provider/docs/src/search/types.ts @@ -0,0 +1,26 @@ +import type { ChunkIndex } from '../corpus/doc/chunks.ts' +import type { DocID } from '../corpus/doc/doc.ts' + +/** A search query. */ +export interface Query { + text: string + meta?: { + activeFilename?: string + } +} + +/** + * A search result from searching a {@link CorpusIndex}. 
+ */ +export interface SearchResult { + doc: DocID + chunk: ChunkIndex + + /** The final score after combining the individual scores from different search methods. */ + score: number + + /** Scores from all search methods that returned this result. */ + scores: { [searchMethod: string]: number } + + excerpt: string +} diff --git a/provider/docs/src/testdata/code/urlParsing.ts b/provider/docs/src/testdata/code/urlParsing.ts new file mode 100644 index 00000000..8a7c652f --- /dev/null +++ b/provider/docs/src/testdata/code/urlParsing.ts @@ -0,0 +1,6 @@ +// @ts-nocheck + +function getAudio(title: string): URL { + const audioFile = searchAudioFiles(title) + return parseAudioURL(audioFile.url) +} diff --git a/provider/docs/src/testdata/corpus/urlParsing.md b/provider/docs/src/testdata/corpus/urlParsing.md new file mode 100644 index 00000000..864548eb --- /dev/null +++ b/provider/docs/src/testdata/corpus/urlParsing.md @@ -0,0 +1,15 @@ +# URL parsing + +To parse a URL, use the `parseURL` function. + +## Image URL parsing + +To parse an image URL, use the `parseImageURL` function. + +## Video URL parsing + +To parse a video URL, use the `parseVideoURL` function. + +## Audio URL parsing + +To parse an audio URL, use the `parseAudioURL` function. 
diff --git a/provider/docs/src/worker/api.ts b/provider/docs/src/worker/api.ts new file mode 100644 index 00000000..c55b17d3 --- /dev/null +++ b/provider/docs/src/worker/api.ts @@ -0,0 +1,11 @@ +export interface WorkerMessagePair< + T extends string = string, + A extends {} = Record, + R extends {} = Record, +> { + type: T + request: { id: number; type: T; args: A } + response: { id: number; result: R } +} + +export interface WorkerEmbedTextMessage extends WorkerMessagePair<'embedText', string, Float32Array> {} diff --git a/provider/docs/src/worker/webWorker.ts b/provider/docs/src/worker/webWorker.ts new file mode 100644 index 00000000..9dea4b06 --- /dev/null +++ b/provider/docs/src/worker/webWorker.ts @@ -0,0 +1,28 @@ +/// + +import { embedTextInThisScope } from '../search/embeddings.ts' +import type { WorkerEmbedTextMessage, WorkerMessagePair } from './api.ts' + +declare let self: DedicatedWorkerGlobalScope + +onRequest( + 'embedText', + async (text: string): Promise => embedTextInThisScope(text, console.debug) +) + +// Tell our host we are ready. +self.postMessage('ready') + +function onRequest

( + type: P['type'], + handler: (args: P['request']['args']) => Promise +): void { + // eslint-disable-next-line @typescript-eslint/no-misused-promises + self.addEventListener('message', async event => { + const request = event.data as P['request'] + if (request.type === type) { + const response: P['response'] = { id: request.id, result: await handler(request.args) } + self.postMessage(response) + } + }) +} diff --git a/provider/docs/src/worker/webWorkerClient.ts b/provider/docs/src/worker/webWorkerClient.ts new file mode 100644 index 00000000..27a5cf93 --- /dev/null +++ b/provider/docs/src/worker/webWorkerClient.ts @@ -0,0 +1,68 @@ +import os from 'node:os' +import type { embedTextInThisScope } from '../search/embeddings.ts' +import type { WorkerEmbedTextMessage, WorkerMessagePair } from './api.ts' + +export const embedTextOnWorker: typeof embedTextInThisScope = async ( + text: string +): Promise => sendMessage('embedText', text) + +async function sendMessage

( + type: P['type'], + args: P['request']['args'] +): Promise { + const worker = await acquireWorker() + const id = nextID() + worker.postMessage({ id, type, args } satisfies P['request']) + return new Promise(resolve => { + const onMessage = (event: MessageEvent): void => { + const response = event.data as P['response'] + if (response.id === id) { + resolve(response.result) + worker.removeEventListener('message', onMessage) + } + } + worker.addEventListener('message', onMessage) + }) +} + +const NUM_WORKERS: number = Math.min( + 8, + ((): number => { + if (typeof navigator !== 'undefined') { + return navigator.hardwareConcurrency + } + try { + return os.cpus().length + // eslint-disable-next-line no-empty + } catch {} + return 1 + })() || 1 +) + +const workers: (Promise | undefined)[] = [] +let workerSeq = 0 + +/** + * Acquire a worker from the pool. Currently the acquisition is round-robin. + */ +async function acquireWorker(): Promise { + const workerID = workerSeq++ % NUM_WORKERS + let workerInstance = workers[workerID] + if (!workerInstance) { + workerInstance = new Promise(resolve => { + const worker = new Worker(new URL('./webWorker.ts', import.meta.url), { type: 'module' }) + + // Wait for worker to become ready. It sends a message when it is ready. The actual message + // doesn't matter. 
+ worker.addEventListener('message', () => resolve(worker)) + }) + console.log('worker', workerID, 'is ready') + workers[workerID] = workerInstance + } + return workerInstance +} + +let id = 1 +function nextID(): number { + return id++ +} diff --git a/provider/docs/tsconfig.json b/provider/docs/tsconfig.json new file mode 100644 index 00000000..180ec787 --- /dev/null +++ b/provider/docs/tsconfig.json @@ -0,0 +1,15 @@ +{ + "extends": "../../.config/tsconfig.base.json", + "compilerOptions": { + "module": "ESNext", + "rootDir": ".", + "outDir": "dist", + "lib": ["ESNext"], + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "emitDeclarationOnly": true, + }, + "include": ["src", "bin"], + "exclude": ["dist", "src/testdata", "vitest.config.ts"], + "references": [{ "path": "../../lib/provider" }], +} diff --git a/provider/docs/vitest.config.ts b/provider/docs/vitest.config.ts new file mode 100644 index 00000000..5bf3f1bc --- /dev/null +++ b/provider/docs/vitest.config.ts @@ -0,0 +1,3 @@ +import { defineConfig } from 'vitest/config' + +export default defineConfig({ test: { pool: 'vmThreads' } }) diff --git a/tsconfig.json b/tsconfig.json index 9e2082ea..c1542b89 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -27,6 +27,7 @@ { "path": "client/vscode/test/integration" }, { "path": "client/web-playground" }, { "path": "provider/google-docs" }, + { "path": "provider/docs" }, { "path": "provider/hello-world" }, { "path": "provider/links" }, { "path": "provider/notion" }, From 1e001320f5f1267e749c4b2979f4d2c21b776a94 Mon Sep 17 00:00:00 2001 From: Quinn Slack Date: Sun, 26 May 2024 17:41:48 +0800 Subject: [PATCH 2/4] wip --- pnpm-lock.yaml | 150 +++++++++++++++++++++--- provider/docs/package.json | 7 +- provider/docs/src/provider/multiplex.ts | 2 + provider/docs/src/provider/provider.ts | 4 +- 4 files changed, 143 insertions(+), 20 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 82527c8c..81611d22 100644 --- a/pnpm-lock.yaml +++ 
b/pnpm-lock.yaml @@ -504,7 +504,7 @@ importers: specifier: workspace:* version: link:../../lib/provider '@xenova/transformers': - specifier: ^2.13.4 + specifier: ^2.17.1 version: 2.17.1 buffer: specifier: ^6.0.3 @@ -516,15 +516,12 @@ importers: specifier: ^10.1.0 version: 10.1.0 onnxruntime-web: - specifier: '1.16' - version: 1.16.3 + specifier: ~1.18.0 + version: 1.18.0 devDependencies: '@types/jsdom': specifier: ^21.1.6 version: 21.1.6 - esbuild: - specifier: ^0.19.11 - version: 0.19.12 vitest-fetch-mock: specifier: ^0.2.2 version: 0.2.2(vitest@1.6.0) @@ -6741,7 +6738,7 @@ packages: resolution: {integrity: sha512-o/zjMZRhJxny7OyEF+Op8X+efiELC7k7yOjMzgfzVqOzXqkBkWI79YoTdOtsuWd5BWhAGAuOY/Xa6xpiaWXiNg==} engines: {node: '>= 14'} dependencies: - debug: 4.3.4(supports-color@8.1.1) + debug: 4.3.4 transitivePeerDependencies: - supports-color @@ -7774,6 +7771,17 @@ packages: ms: 2.1.3 dev: true + /debug@4.3.4: + resolution: {integrity: sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==} + engines: {node: '>=6.0'} + peerDependencies: + supports-color: '*' + peerDependenciesMeta: + supports-color: + optional: true + dependencies: + ms: 2.1.2 + /debug@4.3.4(supports-color@8.1.1): resolution: {integrity: sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==} engines: {node: '>=6.0'} @@ -9351,7 +9359,7 @@ packages: engines: {node: '>= 14'} dependencies: agent-base: 7.1.0 - debug: 4.3.4(supports-color@8.1.1) + debug: 4.3.4 transitivePeerDependencies: - supports-color @@ -9380,7 +9388,7 @@ packages: engines: {node: '>= 14'} dependencies: agent-base: 7.1.0 - debug: 4.3.4(supports-color@8.1.1) + debug: 4.3.4 transitivePeerDependencies: - supports-color @@ -11394,8 +11402,8 @@ packages: resolution: {integrity: sha512-3LJpegM2iMNRX2wUmtYfeX/ytfOzNwAWKSq1HbRrKc9+uqG/FsEA0bbKZl1btQeZaXhC26l44NWpNUeXPII7Ew==} dev: false - /onnxruntime-common@1.16.3: - resolution: {integrity: 
sha512-ZZfFzEqBf6YIGwB9PtBLESHI53jMXA+/hn+ACVUbEfPuK2xI5vMGpLPn+idpwCmHsKJNRzRwqV12K+6TQj6tug==} + /onnxruntime-common@1.18.0: + resolution: {integrity: sha512-lufrSzX6QdKrktAELG5x5VkBpapbCeS3dQwrXbN0eD9rHvU0yAWl7Ztju9FvgAKWvwd/teEKJNj3OwM6eTZh3Q==} dev: false /onnxruntime-node@1.14.0: @@ -11418,13 +11426,13 @@ packages: platform: 1.3.6 dev: false - /onnxruntime-web@1.16.3: - resolution: {integrity: sha512-8O1xCG/RcNQNYYWvdiQJSNpncVg78OVOFeV6MYs/jx++/b12oje8gYUzKqz9wR/sXiX/8TCvdyHgEjj5gQGKUg==} + /onnxruntime-web@1.18.0: + resolution: {integrity: sha512-o1UKj4ABIj1gmG7ae0RKJ3/GT+3yoF0RRpfDfeoe0huzRW4FDRLfbkDETmdFAvnJEXuYDE0YT+hhkia0352StQ==} dependencies: flatbuffers: 1.12.0 guid-typescript: 1.0.9 long: 5.2.3 - onnxruntime-common: 1.16.3 + onnxruntime-common: 1.18.0 platform: 1.3.6 protobufjs: 7.3.0 dev: false @@ -14053,6 +14061,27 @@ packages: vite: 5.1.4(@types/node@20.11.20) dev: false + /vite-node@1.6.0: + resolution: {integrity: sha512-de6HJgzC+TFzOu0NTC4RAIsyf/DY/ibWDYQUcuEA84EMHhcefTUGkjFHKKEJhQN4A+6I0u++kr3l36ZF2d7XRw==} + engines: {node: ^18.0.0 || >=20.0.0} + hasBin: true + dependencies: + cac: 6.7.14 + debug: 4.3.4 + pathe: 1.1.2 + picocolors: 1.0.0 + vite: 5.2.11 + transitivePeerDependencies: + - '@types/node' + - less + - lightningcss + - sass + - stylus + - sugarss + - supports-color + - terser + dev: true + /vite-node@1.6.0(@types/node@20.10.0): resolution: {integrity: sha512-de6HJgzC+TFzOu0NTC4RAIsyf/DY/ibWDYQUcuEA84EMHhcefTUGkjFHKKEJhQN4A+6I0u++kr3l36ZF2d7XRw==} engines: {node: ^18.0.0 || >=20.0.0} @@ -14119,6 +14148,41 @@ packages: fsevents: 2.3.3 dev: false + /vite@5.2.11: + resolution: {integrity: sha512-HndV31LWW05i1BLPMUCE1B9E9GFbOu1MbenhS58FuK6owSO5qHm7GiCotrNY1YE5rMeQSFBGmT5ZaLEjFizgiQ==} + engines: {node: ^18.0.0 || >=20.0.0} + hasBin: true + peerDependencies: + '@types/node': ^18.0.0 || >=20.0.0 + less: '*' + lightningcss: ^1.21.0 + sass: '*' + stylus: '*' + sugarss: '*' + terser: ^5.4.0 + peerDependenciesMeta: + '@types/node': + optional: 
true + less: + optional: true + lightningcss: + optional: true + sass: + optional: true + stylus: + optional: true + sugarss: + optional: true + terser: + optional: true + dependencies: + esbuild: 0.20.2 + postcss: 8.4.38 + rollup: 4.17.2 + optionalDependencies: + fsevents: 2.3.3 + dev: true + /vite@5.2.11(@types/node@20.10.0): resolution: {integrity: sha512-HndV31LWW05i1BLPMUCE1B9E9GFbOu1MbenhS58FuK6owSO5qHm7GiCotrNY1YE5rMeQSFBGmT5ZaLEjFizgiQ==} engines: {node: ^18.0.0 || >=20.0.0} @@ -14162,7 +14226,7 @@ packages: vitest: '>=0.16.0' dependencies: cross-fetch: 3.1.8 - vitest: 1.6.0(@types/node@20.10.0)(jsdom@23.2.0) + vitest: 1.6.0(jsdom@23.2.0) transitivePeerDependencies: - encoding dev: true @@ -14224,6 +14288,62 @@ packages: - terser dev: true + /vitest@1.6.0(jsdom@23.2.0): + resolution: {integrity: sha512-H5r/dN06swuFnzNFhq/dnz37bPXnq8xB2xB5JOVk8K09rUtoeNN+LHWkoQ0A/i3hvbUKKcCei9KpbxqHMLhLLA==} + engines: {node: ^18.0.0 || >=20.0.0} + hasBin: true + peerDependencies: + '@edge-runtime/vm': '*' + '@types/node': ^18.0.0 || >=20.0.0 + '@vitest/browser': 1.6.0 + '@vitest/ui': 1.6.0 + happy-dom: '*' + jsdom: '*' + peerDependenciesMeta: + '@edge-runtime/vm': + optional: true + '@types/node': + optional: true + '@vitest/browser': + optional: true + '@vitest/ui': + optional: true + happy-dom: + optional: true + jsdom: + optional: true + dependencies: + '@vitest/expect': 1.6.0 + '@vitest/runner': 1.6.0 + '@vitest/snapshot': 1.6.0 + '@vitest/spy': 1.6.0 + '@vitest/utils': 1.6.0 + acorn-walk: 8.3.2 + chai: 4.3.10 + debug: 4.3.4 + execa: 8.0.1 + jsdom: 23.2.0 + local-pkg: 0.5.0 + magic-string: 0.30.7 + pathe: 1.1.2 + picocolors: 1.0.0 + std-env: 3.7.0 + strip-literal: 2.0.0 + tinybench: 2.6.0 + tinypool: 0.8.4 + vite: 5.2.11 + vite-node: 1.6.0 + why-is-node-running: 2.2.2 + transitivePeerDependencies: + - less + - lightningcss + - sass + - stylus + - sugarss + - supports-color + - terser + dev: true + /vscode-uri@3.0.8: resolution: {integrity: 
sha512-AyFQ0EVmsOZOlAnxoFOGOq1SQDWAB7C6aqMGS23svWAllfOaxbuFvcT8D1i8z3Gyn8fraVeZNNmN6e9bxxXkKw==} dev: true diff --git a/provider/docs/package.json b/provider/docs/package.json index dd988144..24161ad3 100644 --- a/provider/docs/package.json +++ b/provider/docs/package.json @@ -15,7 +15,7 @@ "sideEffects": false, "scripts": { "bundle": "tsc --build && pnpm run -s bundle:esm && pnpm run -s bundle:cjs", - "bundle:esm": "esbuild src/provider/provider.ts --bundle --outfile=dist/provider.mjs --platform=node --format=esm --sourcemap --define:self=global --loader:.node=file --alias:sharp=/dev/null --define:import.meta.url=import_meta_url --inject:src/polyfill1.js", + "bundle:esm": "esbuild src/provider/provider.ts --bundle --outfile=dist/provider.mjs --platform=node --format=esm --sourcemap --define:self=global --loader:.node=file --alias:sharp=/dev/null --define:__filename='\"provider.mjs\"' --define:__dirname='\"/Users/sqs/src/github.com/sourcegraph/openctx/node_modules/.pnpm/@xenova+transformers@2.17.1/foo\"' --banner:js='import { createRequire } from \"module\";const require = createRequire(import.meta.url);'", "bundle:cjs": "esbuild src/provider/provider.ts --bundle --outfile=dist/provider.cjs --platform=node --format=cjs --sourcemap --define:self=global --loader:.node=file --alias:sharp=/dev/null --define:import.meta.url=import_meta_url --inject:src/polyfill1.js", "prepublishOnly": "tsc --build --clean && npm run --silent bundle", "test": "vitest", @@ -26,15 +26,14 @@ "dependencies": { "@mozilla/readability": "^0.5.0", "@openctx/provider": "workspace:*", - "@xenova/transformers": "^2.13.4", + "@xenova/transformers": "^2.17.1", "buffer": "^6.0.3", "jsdom": "^23.2.0", "lru-cache": "^10.1.0", - "onnxruntime-web": "1.16" + "onnxruntime-web": "~1.18.0" }, "devDependencies": { "@types/jsdom": "^21.1.6", - "esbuild": "^0.19.11", "vitest-fetch-mock": "^0.2.2" } } diff --git a/provider/docs/src/provider/multiplex.ts b/provider/docs/src/provider/multiplex.ts index 
90555976..4af8aa52 100644 --- a/provider/docs/src/provider/multiplex.ts +++ b/provider/docs/src/provider/multiplex.ts @@ -21,6 +21,8 @@ export function multiplex( return { meta: (params, settings) => getProvider(settings).then(p => p.meta(params, settings)), + mentions: (params, settings) => + getProvider(settings).then(p => p.mentions?.(params, settings) ?? []), items: (params, settings) => getProvider(settings).then(p => p.items?.(params, settings) ?? []), annotations: (params, settings) => getProvider(settings).then(p => p.annotations?.(params, settings) ?? []), diff --git a/provider/docs/src/provider/provider.ts b/provider/docs/src/provider/provider.ts index e4d2bed6..46765782 100644 --- a/provider/docs/src/provider/provider.ts +++ b/provider/docs/src/provider/provider.ts @@ -36,6 +36,8 @@ export default multiplex(async settings => { const query = params.query?.trim() const results = query ? await client.search({ text: query }) : client.docs + console.log('AA', results) + const mentions: MentionsResult = [] const seenDocIDs = new Set() for (const result of results) { @@ -100,7 +102,7 @@ function isSearchResult(value: SearchResult | IndexedDoc): value is SearchResult async function fetchIndex(urlStr: string): Promise { const url = new URL(urlStr) if (url.protocol === 'file:') { - const { readFile } = await import('node:fs/promises') + const { readFile } = require('node:fs/promises') return fromJSON(JSON.parse(await readFile(url.pathname, 'utf-8'))) } From e6b69e101587bf60c58551876a60dde38dadd5ed Mon Sep 17 00:00:00 2001 From: Quinn Slack Date: Sun, 26 May 2024 17:54:08 +0800 Subject: [PATCH 3/4] wip --- provider/docs/package.json | 2 +- provider/docs/src/provider/provider.ts | 10 ++++++++++ provider/docs/src/search/embeddings.ts | 4 +++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/provider/docs/package.json b/provider/docs/package.json index 24161ad3..d34554f2 100644 --- a/provider/docs/package.json +++ b/provider/docs/package.json @@ -15,7 
+15,7 @@ "sideEffects": false, "scripts": { "bundle": "tsc --build && pnpm run -s bundle:esm && pnpm run -s bundle:cjs", - "bundle:esm": "esbuild src/provider/provider.ts --bundle --outfile=dist/provider.mjs --platform=node --format=esm --sourcemap --define:self=global --loader:.node=file --alias:sharp=/dev/null --define:__filename='\"provider.mjs\"' --define:__dirname='\"/Users/sqs/src/github.com/sourcegraph/openctx/node_modules/.pnpm/@xenova+transformers@2.17.1/foo\"' --banner:js='import { createRequire } from \"module\";const require = createRequire(import.meta.url);'", + "bundle:esm": "esbuild src/provider/provider.ts --bundle --outfile=dist/provider.mjs --platform=node --format=esm --sourcemap --define:self=global --loader:.node=file --alias:sharp=/dev/null --define:__filename='\"provider.mjs\"' --define:__dirname='\"/Users/sqs/src/github.com/sourcegraph/openctx/node_modules/.pnpm/@xenova+transformers@2.17.1/foo\"' --banner:js='import { createRequire } from \"module\";if (typeof require === \"undefined\") { globalThis.require = createRequire(import.meta.url); };'", "bundle:cjs": "esbuild src/provider/provider.ts --bundle --outfile=dist/provider.cjs --platform=node --format=cjs --sourcemap --define:self=global --loader:.node=file --alias:sharp=/dev/null --define:import.meta.url=import_meta_url --inject:src/polyfill1.js", "prepublishOnly": "tsc --build --clean && npm run --silent bundle", "test": "vitest", diff --git a/provider/docs/src/provider/provider.ts b/provider/docs/src/provider/provider.ts index 46765782..14d089f0 100644 --- a/provider/docs/src/provider/provider.ts +++ b/provider/docs/src/provider/provider.ts @@ -40,6 +40,7 @@ export default multiplex(async settings => { const mentions: MentionsResult = [] const seenDocIDs = new Set() + const seenTitles = new Set() for (const result of results) { const doc = isSearchResult(result) ? 
client.doc(result.doc) : result if (seenDocIDs.has(doc.doc.id)) { @@ -47,6 +48,15 @@ export default multiplex(async settings => { } seenDocIDs.add(doc.doc.id) + // HACK + if (!doc.content?.title) { + continue + } + if (seenTitles.has(doc.content?.title)) { + continue + } + seenTitles.add(doc.content?.title) + const uri = doc.doc?.url if (uri) { mentions.push({ diff --git a/provider/docs/src/search/embeddings.ts b/provider/docs/src/search/embeddings.ts index 113549bb..a191b12c 100644 --- a/provider/docs/src/search/embeddings.ts +++ b/provider/docs/src/search/embeddings.ts @@ -43,7 +43,9 @@ if (isWebWindowRuntime) { ) } else if (typeof __dirname !== 'undefined') { // TODO(sqs): seems to be triggered when running in vscode - env.backends.onnx.wasm.wasmPaths = __dirname + '/../node_modules/@xenova/transformers/dist/' + //env.backends.onnx.wasm.wasmPaths = __dirname + '/../node_modules/@xenova/transformers/dist/' + env.backends.onnx.wasm.wasmPaths = + '/Users/sqs/src/github.com/sourcegraph/openctx/provider/docs/node_modules/@xenova/transformers/dist/' env.backends.onnx.wasm.numThreads = 1 } From 20e92ba4dbbd7d00007b679aea6852d4cee9f9cf Mon Sep 17 00:00:00 2001 From: Quinn Slack Date: Sun, 26 May 2024 19:58:51 +0800 Subject: [PATCH 4/4] wip --- provider/docs/src/provider/provider.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provider/docs/src/provider/provider.ts b/provider/docs/src/provider/provider.ts index 14d089f0..57cbaac2 100644 --- a/provider/docs/src/provider/provider.ts +++ b/provider/docs/src/provider/provider.ts @@ -27,7 +27,7 @@ export default multiplex(async settings => { return { meta(): MetaResult { return { - name: 'docs', + name: 'docs.anthropic.com', features: { mentions: true }, } },