diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts index 3b7c5b5348e..161214fa49a 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts @@ -19,6 +19,7 @@ import { TokenizationPath } from './tokenization-subsets.js'; import LexicalModel = LexicalModelTypes.LexicalModel; import Transform = LexicalModelTypes.Transform; +import { PathInputProperties } from './search-space.js'; // May be able to "get away" with 2 & 5 or so, but having extra will likely help // with edit path stability. @@ -561,7 +562,8 @@ export class ContextTokenization { } let appliedLength = 0; - for(let tailRelativeIndex of inputTransformKeys) { + for(let i = 0; i < inputTransformKeys.length; i++) { + const tailRelativeIndex = inputTransformKeys[i]; let distribution = inputs.map((i) => ({sample: i.sample.get(tailRelativeIndex), p: i.p})); const tokenIndex = (tokenization.length - 1) + tailRelativeIndex; @@ -585,7 +587,8 @@ export class ContextTokenization { if(affectedToken.inputCount == 0 && distribution[0].sample.deleteLeft != 0) { distribution = distribution.map((mass) => ({sample: { ...mass.sample, deleteLeft: 0 }, p: mass.p })); } - affectedToken.addInput({ + + const inputSource: PathInputProperties = { segment: { trueTransform: sourceInput, transitionId: sourceInput.id, @@ -593,8 +596,13 @@ export class ContextTokenization { }, bestProbFromSet: bestProbFromSet, subsetId: tokenizationPath.inputSubsetId - }, distribution); + }; appliedLength += KMWString.length(distribution[0].sample.insert); + if(i + 1 < inputTransformKeys.length) { + inputSource.segment.end = appliedLength; + } + + affectedToken.addInput(inputSource, distribution); const tokenize = determineModelTokenizer(lexicalModel); affectedToken.isWhitespace 
= tokenize({left: affectedToken.exampleInput, startOfBuffer: false, endOfBuffer: false}).left[0]?.isWhitespace ?? false; diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts index 9db8329dd14..768ab535d9a 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts @@ -256,7 +256,7 @@ export class SearchPath implements SearchSpace { this.selectionQueue = new PriorityQueue(QUEUE_NODE_COMPARATOR, entries); } - public split(charIndex: number): [SearchSpace, SearchSpace] { + public split(charIndex: number): [SearchSpace, SearchPath] { const model = this.model; const internalSplitIndex = charIndex - (this.codepointLength - this.edgeLength); @@ -295,7 +295,13 @@ export class SearchPath implements SearchSpace { // don't append any part of it to the parent; it's actually clean. const hasActualSplit = internalSplitIndex > 0 || this.inputs?.[0].sample.deleteLeft > 0; const parent = hasActualSplit - ? new SearchPath(this.parentSpace, firstSet, this.inputSource) + ? new SearchPath(this.parentSpace, firstSet, { + ...this.inputSource, + segment: { + ...this.inputSource.segment, + end: this.inputSource.segment.start + internalSplitIndex + } + }) : this.parentSpace; // construct two SearchPath instances based on the two sets! return [ @@ -468,7 +474,12 @@ export class SearchPath implements SearchSpace { for(const source of sources) { const i = source.segment.start; - components.push(`T${source.segment.transitionId}${i != 0 ? '@' + i : ''}`); + const j = source.segment.end; + let component = (`T${source.segment.transitionId}${i != 0 || j !== undefined ? 
'@' + i : ''}`); + if(j !== undefined) { + component = component + '-' + j; + } + components.push(component); } return components.join('+'); diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts index 8884a07a0c6..a3e9f22796e 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts @@ -60,6 +60,17 @@ export interface InputSegment { * tokenized correction-search input. */ start: number + + /** + * Marks the final index (exclusive) within the insert strings for the + * corresponding transitions' Transforms that are applied by the corresponding + * tokenized correction-search input. + * + * If undefined, there is no portion of the input-source transform split from + * the right-hand side. Otherwise, this value should match the `start` value of + * the _next_ split-off component of the input-source. 
+ */ + end?: number; } /** diff --git a/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts b/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts index 2ec0e8b856b..05f29ce903d 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts @@ -6,6 +6,7 @@ export { ContextTracker } from './correction/context-tracker.js'; export { ContextTransition } from './correction/context-transition.js'; export * from './correction/distance-modeler.js'; export * from './correction/search-path.js'; +export * from './correction/search-space.js'; export { ExtendedEditOperation, SegmentableDistanceCalculation } from './correction/segmentable-calculation.js'; export * from './correction/tokenization-subsets.js'; export * as correction from './correction/index.js'; diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts index 1a9d39d25b6..ab5938b1608 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts @@ -14,7 +14,7 @@ import { default as defaultBreaker } from '@keymanapp/models-wordbreakers'; import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'; import { LexicalModelTypes } from '@keymanapp/common-types'; -import { ContextToken, correction, generateSubsetId, getBestMatches, models, preprocessInputSources, SearchPath } from '@keymanapp/lm-worker/test-index'; +import { ContextToken, correction, generateSubsetId, getBestMatches, models, PathInputProperties, preprocessInputSources, SearchPath } from '@keymanapp/lm-worker/test-index'; import Distribution = LexicalModelTypes.Distribution; import ExecutionTimer = correction.ExecutionTimer; 
@@ -464,20 +464,29 @@ describe('ContextToken', function() { assert.equal(resultsOfSplit.length, 3); assert.sameOrderedMembers(resultsOfSplit.map(t => t.exampleInput), splitTextArray); - assert.sameDeepOrderedMembers(resultsOfSplit.map(t => t.inputSegments[0]), [0, 3, 8].map(i => ({ - segment: { - trueTransform: { - insert: 'biglargetransform', - id: 13, - deleteLeft: 0, - deleteRight: 0 + const offsets = [0, 3, 8]; + assert.sameDeepOrderedMembers(resultsOfSplit.map(t => t.inputSegments[0]), [0, 1, 2].map(i => { + const inputSource: PathInputProperties = { + segment: { + trueTransform: { + insert: 'biglargetransform', + id: 13, + deleteLeft: 0, + deleteRight: 0 + }, + transitionId: 13, + start: offsets[i] }, - transitionId: 13, - start: i - }, - bestProbFromSet: 1, - subsetId - }))); + bestProbFromSet: 1, + subsetId + }; + + if(offsets[i+1] !== undefined) { + inputSource.segment.end = offsets[i+1]; + } + + return inputSource; + })); for(let i = 0; i < resultsOfSplit.length; i++) { assert.isTrue(resultsOfSplit[i].searchSpace.hasInputs([ @@ -549,7 +558,8 @@ describe('ContextToken', function() { segment: { trueTransform: keystrokeDistributions[1][0].sample, transitionId: keystrokeDistributions[1][0].sample.id, - start: 0 + start: 0, + end: 'arge'.length }, bestProbFromSet: 1, subsetId: subsetIds[1] @@ -568,7 +578,8 @@ describe('ContextToken', function() { segment: { trueTransform: keystrokeDistributions[2][0].sample, transitionId: keystrokeDistributions[2][0].sample.id, - start: 0 + start: 0, + end: 'ng'.length }, bestProbFromSet: 1, subsetId: subsetIds[2] @@ -693,7 +704,8 @@ describe('ContextToken', function() { segment: { trueTransform: keystrokeDistributions[1][0].sample, transitionId: keystrokeDistributions[1][0].sample.id, - start: 0 + start: 0, + end: 'arge'.length }, bestProbFromSet: 1, subsetId: subsetIds[1] @@ -711,7 +723,8 @@ describe('ContextToken', function() { segment: { trueTransform: keystrokeDistributions[2][0].sample, transitionId: 
keystrokeDistributions[2][0].sample.id, - start: 0 + start: 0, + end: 'ng'.length }, bestProbFromSet: 1, subsetId: subsetIds[2] diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts index 5e50b41f50b..51799a54058 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts @@ -465,7 +465,8 @@ describe('ContextTokenization', function() { segment: { trueTransform: inputTransform, transitionId: inputTransform.id, - start: 0 + start: 0, + end: 0 }, bestProbFromSet: 1, subsetId }); @@ -476,7 +477,8 @@ describe('ContextTokenization', function() { segment: { trueTransform: inputTransform, transitionId: inputTransform.id, - start: 0 + start: 0, + end: 1 // captured the leading whitespace insert }, bestProbFromSet: 1, subsetId }]); diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-path.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-path.tests.ts index a2a5760d30b..4a832d6feaa 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-path.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-path.tests.ts @@ -1361,7 +1361,13 @@ describe('SearchPath', () => { assert.isTrue(tail instanceof SearchPath); assert.deepEqual((head as SearchPath).inputs, headTarget.inputs); assert.deepEqual((tail as SearchPath).inputs, tailTarget.inputs); - assert.deepEqual((head as SearchPath).inputSource, headTarget.inputSource); + assert.deepEqual((head as SearchPath).inputSource, { + ...headTarget.inputSource, + segment: { + ...headTarget.inputSource.segment, + end: 2 + } + }); 
assert.deepEqual((tail as SearchPath).inputSource, tailTarget.inputSource); }); });