From 6ac96d6bcf9b2a1971b81d4c1cd987cf27c2fcd7 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Wed, 5 Nov 2025 11:11:47 -0600 Subject: [PATCH 1/2] change(web): track right-hand split index for input source of tokenized transforms Build-bot: skip build:web Test-bot: skip --- .../src/main/correction/context-tokenization.ts | 14 +++++++++++--- .../src/main/correction/search-path.ts | 7 ++++++- .../src/main/correction/search-space.ts | 11 +++++++++++ .../context/context-tokenization.tests.ts | 6 ++++-- 4 files changed, 32 insertions(+), 6 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts index b641e48a1b3..27239dd0506 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts @@ -19,6 +19,7 @@ import { TokenizationPath } from './tokenization-subsets.js'; import LexicalModel = LexicalModelTypes.LexicalModel; import Transform = LexicalModelTypes.Transform; +import { PathInputProperties } from './search-space.js'; // May be able to "get away" with 2 & 5 or so, but having extra will likely help // with edit path stability. 
@@ -561,7 +562,8 @@ export class ContextTokenization { } let appliedLength = 0; - for(let tailRelativeIndex of inputTransformKeys) { + for(let i = 0; i < inputTransformKeys.length; i++) { + const tailRelativeIndex = inputTransformKeys[i]; let distribution = inputs.map((i) => ({sample: i.sample.get(tailRelativeIndex), p: i.p})); const tokenIndex = (tokenization.length - 1) + tailRelativeIndex; @@ -585,7 +587,8 @@ export class ContextTokenization { if(affectedToken.inputCount == 0 && distribution[0].sample.deleteLeft != 0) { distribution = distribution.map((mass) => ({sample: { ...mass.sample, deleteLeft: 0 }, p: mass.p })); } - affectedToken.addInput({ + + const inputSource: PathInputProperties = { segment: { trueTransform: sourceInput, transitionId: sourceInput.id, @@ -593,8 +596,13 @@ export class ContextTokenization { }, bestProbFromSet: bestProbFromSet, subsetId: tokenizationPath.inputSubsetId - }, distribution); + }; appliedLength += KMWString.length(distribution[0].sample.insert); + if(i + 1 < inputTransformKeys.length) { + inputSource.segment.end = appliedLength; + } + + affectedToken.addInput(inputSource, distribution); const tokenize = determineModelTokenizer(lexicalModel); affectedToken.isWhitespace = tokenize({left: affectedToken.exampleInput, startOfBuffer: false, endOfBuffer: false}).left[0]?.isWhitespace ?? false; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts index 9bf2e478bc8..51a0d606263 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts @@ -471,7 +471,12 @@ export class SearchPath implements SearchSpace { for(const source of sources) { const i = source.segment.start; - components.push(`T${source.segment.transitionId}${i != 0 ? 
'@' + i : ''}`); + const j = source.segment.end; + let component = (`T${source.segment.transitionId}${i != 0 || j !== undefined ? '@' + i : ''}`); + if(j !== undefined) { // 0 is a valid (empty) end index; test for undefined, not truthiness + component = component + '-' + j; + } + components.push(component); } return components.join('+'); diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts index 8884a07a0c6..a3e9f22796e 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts @@ -60,6 +60,17 @@ export interface InputSegment { * tokenized correction-search input. */ start: number + + /** + * Marks the final index (exclusive) within the insert strings for the + * corresponding transitions' Transforms that are applied by the corresponding + * tokenized correction-search input. + * + * If undefined, there is no portion of the input-source transform split from + * the right-hand side. Otherwise, this value should match the `start` value of + * the _next_ split-off component of the input-source. 
+ */ + end?: number; } /** diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts index 5e50b41f50b..51799a54058 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts @@ -465,7 +465,8 @@ describe('ContextTokenization', function() { segment: { trueTransform: inputTransform, transitionId: inputTransform.id, - start: 0 + start: 0, + end: 0 }, bestProbFromSet: 1, subsetId }); @@ -476,7 +477,8 @@ describe('ContextTokenization', function() { segment: { trueTransform: inputTransform, transitionId: inputTransform.id, - start: 0 + start: 0, + end: 1 // captured the leading whitespace insert }, bestProbFromSet: 1, subsetId }]); From b51dbc3dbcaa6f22c0f04d60551a5817d2694802 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Mon, 10 Nov 2025 16:05:12 -0600 Subject: [PATCH 2/2] change(web): enhance SearchPath.split() unit tests per new inputSplitIndex field --- .../src/main/correction/search-path.ts | 10 +++- .../worker-thread/src/main/test-index.ts | 1 + .../context/context-token.tests.ts | 49 ++++++++++++------- .../correction-search/search-path.tests.ts | 8 ++- 4 files changed, 47 insertions(+), 21 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts index 51a0d606263..71e9fa2be0c 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts @@ -259,7 +259,7 @@ export class SearchPath implements SearchSpace { this.selectionQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR, entries); } - public 
split(charIndex: number): [SearchSpace, SearchSpace] { + public split(charIndex: number): [SearchSpace, SearchPath] { const model = this.model; const internalSplitIndex = charIndex - (this.codepointLength - this.edgeLength); @@ -298,7 +298,13 @@ export class SearchPath implements SearchSpace { // don't append any part of it to the parent; it's actually clean. const hasActualSplit = internalSplitIndex > 0 || this.inputs?.[0].sample.deleteLeft > 0; const parent = hasActualSplit - ? new SearchPath(this.parentSpace, firstSet, this.inputSource) + ? new SearchPath(this.parentSpace, firstSet, { + ...this.inputSource, + segment: { + ...this.inputSource.segment, + end: this.inputSource.segment.start + internalSplitIndex + } + }) : this.parentSpace; // construct two SearchPath instances based on the two sets! return [ diff --git a/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts b/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts index 2ec0e8b856b..05f29ce903d 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts @@ -6,6 +6,7 @@ export { ContextTracker } from './correction/context-tracker.js'; export { ContextTransition } from './correction/context-transition.js'; export * from './correction/distance-modeler.js'; export * from './correction/search-path.js'; +export * from './correction/search-space.js'; export { ExtendedEditOperation, SegmentableDistanceCalculation } from './correction/segmentable-calculation.js'; export * from './correction/tokenization-subsets.js'; export * as correction from './correction/index.js'; diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts index 1a9d39d25b6..ab5938b1608 100644 --- 
a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts @@ -14,7 +14,7 @@ import { default as defaultBreaker } from '@keymanapp/models-wordbreakers'; import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'; import { LexicalModelTypes } from '@keymanapp/common-types'; -import { ContextToken, correction, generateSubsetId, getBestMatches, models, preprocessInputSources, SearchPath } from '@keymanapp/lm-worker/test-index'; +import { ContextToken, correction, generateSubsetId, getBestMatches, models, PathInputProperties, preprocessInputSources, SearchPath } from '@keymanapp/lm-worker/test-index'; import Distribution = LexicalModelTypes.Distribution; import ExecutionTimer = correction.ExecutionTimer; @@ -464,20 +464,29 @@ describe('ContextToken', function() { assert.equal(resultsOfSplit.length, 3); assert.sameOrderedMembers(resultsOfSplit.map(t => t.exampleInput), splitTextArray); - assert.sameDeepOrderedMembers(resultsOfSplit.map(t => t.inputSegments[0]), [0, 3, 8].map(i => ({ - segment: { - trueTransform: { - insert: 'biglargetransform', - id: 13, - deleteLeft: 0, - deleteRight: 0 + const offsets = [0, 3, 8]; + assert.sameDeepOrderedMembers(resultsOfSplit.map(t => t.inputSegments[0]), [0, 1, 2].map(i => { + const inputSource: PathInputProperties = { + segment: { + trueTransform: { + insert: 'biglargetransform', + id: 13, + deleteLeft: 0, + deleteRight: 0 + }, + transitionId: 13, + start: offsets[i] }, - transitionId: 13, - start: i - }, - bestProbFromSet: 1, - subsetId - }))); + bestProbFromSet: 1, + subsetId + }; + + if(offsets[i+1] !== undefined) { + inputSource.segment.end = offsets[i+1]; + } + + return inputSource; + })); for(let i = 0; i < resultsOfSplit.length; i++) { assert.isTrue(resultsOfSplit[i].searchSpace.hasInputs([ @@ -549,7 +558,8 @@ describe('ContextToken', function() { segment: { 
trueTransform: keystrokeDistributions[1][0].sample, transitionId: keystrokeDistributions[1][0].sample.id, - start: 0 + start: 0, + end: 'arge'.length }, bestProbFromSet: 1, subsetId: subsetIds[1] @@ -568,7 +578,8 @@ describe('ContextToken', function() { segment: { trueTransform: keystrokeDistributions[2][0].sample, transitionId: keystrokeDistributions[2][0].sample.id, - start: 0 + start: 0, + end: 'ng'.length }, bestProbFromSet: 1, subsetId: subsetIds[2] @@ -693,7 +704,8 @@ describe('ContextToken', function() { segment: { trueTransform: keystrokeDistributions[1][0].sample, transitionId: keystrokeDistributions[1][0].sample.id, - start: 0 + start: 0, + end: 'arge'.length }, bestProbFromSet: 1, subsetId: subsetIds[1] @@ -711,7 +723,8 @@ describe('ContextToken', function() { segment: { trueTransform: keystrokeDistributions[2][0].sample, transitionId: keystrokeDistributions[2][0].sample.id, - start: 0 + start: 0, + end: 'ng'.length }, bestProbFromSet: 1, subsetId: subsetIds[2] diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-path.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-path.tests.ts index a2a5760d30b..4a832d6feaa 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-path.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-path.tests.ts @@ -1361,7 +1361,13 @@ describe('SearchPath', () => { assert.isTrue(tail instanceof SearchPath); assert.deepEqual((head as SearchPath).inputs, headTarget.inputs); assert.deepEqual((tail as SearchPath).inputs, tailTarget.inputs); - assert.deepEqual((head as SearchPath).inputSource, headTarget.inputSource); + assert.deepEqual((head as SearchPath).inputSource, { + ...headTarget.inputSource, + segment: { + ...headTarget.inputSource.segment, + end: 2 + } + }); assert.deepEqual((tail as SearchPath).inputSource, tailTarget.inputSource); 
}); });