From 6ac96d6bcf9b2a1971b81d4c1cd987cf27c2fcd7 Mon Sep 17 00:00:00 2001
From: Joshua Horton <joshua_horton@sil.org>
Date: Wed, 5 Nov 2025 11:11:47 -0600
Subject: [PATCH 1/2] change(web): track right-hand split index for input
 source of tokenized transforms

Build-bot: skip build:web
Test-bot: skip
---
 .../src/main/correction/context-tokenization.ts    | 14 +++++++++++---
 .../src/main/correction/search-path.ts             |  7 ++++++-
 .../src/main/correction/search-space.ts            | 11 +++++++++++
 .../context/context-tokenization.tests.ts          |  6 ++++--
 4 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
index b641e48a1b3..27239dd0506 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts
@@ -19,6 +19,7 @@ import { TokenizationPath } from './tokenization-subsets.js';
 
 import LexicalModel = LexicalModelTypes.LexicalModel;
 import Transform = LexicalModelTypes.Transform;
+import { PathInputProperties } from './search-space.js';
 
 // May be able to "get away" with 2 & 5 or so, but having extra will likely help
 // with edit path stability.
@@ -561,7 +562,8 @@ export class ContextTokenization {
     }
 
     let appliedLength = 0;
-    for(let tailRelativeIndex of inputTransformKeys) {
+    for(let i = 0; i < inputTransformKeys.length; i++) {
+      const tailRelativeIndex = inputTransformKeys[i];
       let distribution = inputs.map((i) => ({sample: i.sample.get(tailRelativeIndex), p: i.p}));
       const tokenIndex = (tokenization.length - 1) + tailRelativeIndex;
 
@@ -585,7 +587,8 @@ export class ContextTokenization {
       if(affectedToken.inputCount == 0 && distribution[0].sample.deleteLeft != 0) {
         distribution = distribution.map((mass) => ({sample: { ...mass.sample, deleteLeft: 0 }, p: mass.p }));
       }
-      affectedToken.addInput({
+
+      const inputSource: PathInputProperties = {
         segment: {
           trueTransform: sourceInput,
           transitionId: sourceInput.id,
@@ -593,8 +596,13 @@ export class ContextTokenization {
         },
         bestProbFromSet: bestProbFromSet,
         subsetId: tokenizationPath.inputSubsetId
-      }, distribution);
+      };
       appliedLength += KMWString.length(distribution[0].sample.insert);
+      if(i + 1 < inputTransformKeys.length) {
+        inputSource.segment.end = appliedLength;
+      }
+
+      affectedToken.addInput(inputSource, distribution);
 
       const tokenize = determineModelTokenizer(lexicalModel);
       affectedToken.isWhitespace = tokenize({left: affectedToken.exampleInput, startOfBuffer: false, endOfBuffer: false}).left[0]?.isWhitespace ?? false;
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts
index 9bf2e478bc8..51a0d606263 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts
@@ -471,7 +471,12 @@ export class SearchPath implements SearchSpace {
 
     for(const source of sources) {
       const i = source.segment.start;
-      components.push(`T${source.segment.transitionId}${i != 0 ? '@' + i : ''}`);
+      const j = source.segment.end;
+      let component = (`T${source.segment.transitionId}${i != 0 || j !== undefined  ? '@' + i : ''}`);
+      if(j) {
+        component = component + '-' + j;
+      }
+      components.push(component);
     }
 
     return components.join('+');
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts
index 8884a07a0c6..a3e9f22796e 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts
@@ -60,6 +60,17 @@ export interface InputSegment {
    * tokenized correction-search input.
    */
   start: number
+
+  /**
+   * Marks the final index (exclusive) within the insert strings for the
+   * corresponding transitions' Transforms that are applied by the corresponding
+   * tokenized correction-search input.
+   *
+   * If undefined, there is no portion of the input-source transform split from
+   * the right-hand side.  Otherwise, this value should match the `start` value of
+   * the _next_ split-off component of the input-source.
+   */
+  end?: number;
 }
 
 /**
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts
index 5e50b41f50b..51799a54058 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts
@@ -465,7 +465,8 @@ describe('ContextTokenization', function() {
         segment: {
           trueTransform: inputTransform,
           transitionId: inputTransform.id,
-          start: 0
+          start: 0,
+          end: 0
         }, bestProbFromSet: 1,
         subsetId
       });
@@ -476,7 +477,8 @@ describe('ContextTokenization', function() {
         segment: {
           trueTransform: inputTransform,
           transitionId: inputTransform.id,
-          start: 0
+          start: 0,
+          end: 1 // captured the leading whitespace insert
         }, bestProbFromSet: 1,
         subsetId
       }]);

From b51dbc3dbcaa6f22c0f04d60551a5817d2694802 Mon Sep 17 00:00:00 2001
From: Joshua Horton <joshua_horton@sil.org>
Date: Mon, 10 Nov 2025 16:05:12 -0600
Subject: [PATCH 2/2] change(web): enhance SearchPath.split() unit tests per
 new InputSegment.end field

---
 .../src/main/correction/search-path.ts        | 10 +++-
 .../worker-thread/src/main/test-index.ts      |  1 +
 .../context/context-token.tests.ts            | 49 ++++++++++++-------
 .../correction-search/search-path.tests.ts    |  8 ++-
 4 files changed, 47 insertions(+), 21 deletions(-)

diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts
index 51a0d606263..71e9fa2be0c 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-path.ts
@@ -259,7 +259,7 @@ export class SearchPath implements SearchSpace {
     this.selectionQueue = new PriorityQueue<SearchNode>(QUEUE_NODE_COMPARATOR, entries);
   }
 
-  public split(charIndex: number): [SearchSpace, SearchSpace] {
+  public split(charIndex: number): [SearchSpace, SearchPath] {
     const model = this.model;
     const internalSplitIndex = charIndex - (this.codepointLength - this.edgeLength);
 
@@ -298,7 +298,13 @@ export class SearchPath implements SearchSpace {
       // don't append any part of it to the parent; it's actually clean.
       const hasActualSplit = internalSplitIndex > 0 || this.inputs?.[0].sample.deleteLeft > 0;
       const parent = hasActualSplit
-        ? new SearchPath(this.parentSpace, firstSet, this.inputSource)
+        ? new SearchPath(this.parentSpace, firstSet, {
+          ...this.inputSource,
+          segment: {
+            ...this.inputSource.segment,
+            end: this.inputSource.segment.start + internalSplitIndex
+          }
+        })
         : this.parentSpace;
       // construct two SearchPath instances based on the two sets!
       return [
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts b/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts
index 2ec0e8b856b..05f29ce903d 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/test-index.ts
@@ -6,6 +6,7 @@ export { ContextTracker } from './correction/context-tracker.js';
 export { ContextTransition } from './correction/context-transition.js';
 export * from './correction/distance-modeler.js';
 export * from './correction/search-path.js';
+export * from './correction/search-space.js';
 export { ExtendedEditOperation, SegmentableDistanceCalculation } from './correction/segmentable-calculation.js';
 export * from './correction/tokenization-subsets.js';
 export * as correction from './correction/index.js';
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts
index 1a9d39d25b6..ab5938b1608 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-token.tests.ts
@@ -14,7 +14,7 @@ import { default as defaultBreaker } from '@keymanapp/models-wordbreakers';
 import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs';
 import { LexicalModelTypes } from '@keymanapp/common-types';
 
-import { ContextToken, correction, generateSubsetId, getBestMatches, models, preprocessInputSources, SearchPath } from '@keymanapp/lm-worker/test-index';
+import { ContextToken, correction, generateSubsetId, getBestMatches, models, PathInputProperties, preprocessInputSources, SearchPath } from '@keymanapp/lm-worker/test-index';
 
 import Distribution = LexicalModelTypes.Distribution;
 import ExecutionTimer = correction.ExecutionTimer;
@@ -464,20 +464,29 @@ describe('ContextToken', function() {
 
       assert.equal(resultsOfSplit.length, 3);
       assert.sameOrderedMembers(resultsOfSplit.map(t => t.exampleInput), splitTextArray);
-      assert.sameDeepOrderedMembers(resultsOfSplit.map(t => t.inputSegments[0]), [0, 3, 8].map(i => ({
-        segment: {
-          trueTransform: {
-            insert: 'biglargetransform',
-            id: 13,
-            deleteLeft: 0,
-            deleteRight: 0
+      const offsets = [0, 3, 8];
+      assert.sameDeepOrderedMembers(resultsOfSplit.map(t => t.inputSegments[0]), [0, 1, 2].map(i => {
+        const inputSource: PathInputProperties = {
+          segment: {
+            trueTransform: {
+              insert: 'biglargetransform',
+              id: 13,
+              deleteLeft: 0,
+              deleteRight: 0
+            },
+            transitionId: 13,
+            start: offsets[i]
           },
-          transitionId: 13,
-          start: i
-        },
-        bestProbFromSet: 1,
-        subsetId
-      })));
+          bestProbFromSet: 1,
+          subsetId
+        };
+
+        if(offsets[i+1] !== undefined) {
+          inputSource.segment.end = offsets[i+1];
+        }
+
+        return inputSource;
+      }));
 
       for(let i = 0; i < resultsOfSplit.length; i++) {
         assert.isTrue(resultsOfSplit[i].searchSpace.hasInputs([
@@ -549,7 +558,8 @@ describe('ContextToken', function() {
           segment: {
             trueTransform: keystrokeDistributions[1][0].sample,
             transitionId: keystrokeDistributions[1][0].sample.id,
-            start: 0
+            start: 0,
+            end: 'arge'.length
           },
           bestProbFromSet: 1,
           subsetId: subsetIds[1]
@@ -568,7 +578,8 @@ describe('ContextToken', function() {
           segment: {
             trueTransform: keystrokeDistributions[2][0].sample,
             transitionId: keystrokeDistributions[2][0].sample.id,
-            start: 0
+            start: 0,
+            end: 'ng'.length
           },
           bestProbFromSet: 1,
           subsetId: subsetIds[2]
@@ -693,7 +704,8 @@ describe('ContextToken', function() {
           segment: {
             trueTransform: keystrokeDistributions[1][0].sample,
             transitionId: keystrokeDistributions[1][0].sample.id,
-            start: 0
+            start: 0,
+            end: 'arge'.length
           },
           bestProbFromSet: 1,
           subsetId: subsetIds[1]
@@ -711,7 +723,8 @@ describe('ContextToken', function() {
           segment: {
             trueTransform: keystrokeDistributions[2][0].sample,
             transitionId: keystrokeDistributions[2][0].sample.id,
-            start: 0
+            start: 0,
+            end: 'ng'.length
           },
           bestProbFromSet: 1,
           subsetId: subsetIds[2]
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-path.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-path.tests.ts
index a2a5760d30b..4a832d6feaa 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-path.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/search-path.tests.ts
@@ -1361,7 +1361,13 @@ describe('SearchPath', () => {
       assert.isTrue(tail instanceof SearchPath);
       assert.deepEqual((head as SearchPath).inputs, headTarget.inputs);
       assert.deepEqual((tail as SearchPath).inputs, tailTarget.inputs);
-      assert.deepEqual((head as SearchPath).inputSource, headTarget.inputSource);
+      assert.deepEqual((head as SearchPath).inputSource, {
+        ...headTarget.inputSource,
+        segment: {
+          ...headTarget.inputSource.segment,
+          end: 2
+        }
+      });
       assert.deepEqual((tail as SearchPath).inputSource, tailTarget.inputSource);
     });
   });