@@ -243,7 +243,7 @@ export class ContextState {
const baseTokenization = startTokenizationsAfterSlide[0];
// For multiple tokenizations, we'd retrieve each, use the "most likely" one as base,
// and then fold all resulting search spaces (on the final token) into one.
const tokenizationAnalysis = trueInputSubset.pendingSet.get(baseTokenization);
const tokenizationAnalysis = trueInputSubset.transitionPaths.get(baseTokenization);

// Determine the best probability from among ALL available inputs, before they're split
// into subsets.
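A hedged sketch of the folding behaviour the comment above anticipates; nothing below is part of this PR. `foldSearchSpaces` is a hypothetical helper, and `startTokenizationsAfterSlide` is assumed to already be ordered from most to least likely.

```typescript
// Illustrative only: handle several surviving start tokenizations by taking the
// most likely one's path as the base and folding the rest into it.
const analyses = startTokenizationsAfterSlide
  .map((tokenization) => trueInputSubset.transitionPaths.get(tokenization))
  .filter((analysis): analysis is TokenizationPath => !!analysis);

const baseAnalysis = analyses[0];
// const folded = foldSearchSpaces(baseAnalysis, analyses.slice(1)); // hypothetical
```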
@@ -15,7 +15,7 @@ import TransformUtils from '../transformUtils.js';
import { computeDistance, EditOperation, EditTuple } from './classical-calculation.js';
import { determineModelTokenizer } from '../model-helpers.js';
import { ExtendedEditOperation, SegmentableDistanceCalculation } from './segmentable-calculation.js';
import { PendingTokenization } from './tokenization-subsets.js';
import { TokenizationPath } from './tokenization-subsets.js';

import LexicalModel = LexicalModelTypes.LexicalModel;
import Transform = LexicalModelTypes.Transform;
@@ -108,7 +108,7 @@ export class ContextTokenization {
* The tokenization-transition metadata relating this instance to the most likely
* tokenization from a prior state.
*/
readonly transitionEdits?: PendingTokenization;
readonly transitionEdits?: TokenizationPath;

/**
* The portion of edits from the true input keystroke that are not part of the
@@ -125,10 +125,10 @@ export class ContextTokenization {

constructor(priorToClone: ContextTokenization);
constructor(tokens: ContextToken[]);
constructor(tokens: ContextToken[], alignment: PendingTokenization, taillessTrueKeystroke: Transform);
constructor(tokens: ContextToken[], alignment: TokenizationPath, taillessTrueKeystroke: Transform);
constructor(
param1: ContextToken[] | ContextTokenization,
alignment?: PendingTokenization,
alignment?: TokenizationPath,
taillessTrueKeystroke?: Transform
) {
if(!(param1 instanceof ContextTokenization)) {
@@ -490,7 +490,7 @@ export class ContextTokenization {
* Given results from `precomputeTokenizationAfterInput`, this method will
* evaluate the pending transition in tokenization for all associated inputs
* while reusing as many correction-search intermediate results as possible.
* @param pendingTokenization Batched results from one or more
* @param tokenizationPath Batched results from one or more
* `precomputeTokenizationAfterInput` calls on this instance, all with the
* same alignment values.
* @param lexicalModel The active lexical model
@@ -499,16 +499,16 @@
* @param bestProbFromSet The probability of the single most likely input
* transform in the overall transformDistribution associated with the
* keystroke triggering the transition. It need not be represented by the
* pendingTokenization to be built.
* tokenizationPath to be built.
* @returns
*/
evaluateTransition(
pendingTokenization: PendingTokenization,
tokenizationPath: TokenizationPath,
lexicalModel: LexicalModel,
sourceInput: Transform,
bestProbFromSet: number
): ContextTokenization {
const { alignment: alignment, inputs } = pendingTokenization;
const { alignment: alignment, inputs } = tokenizationPath;
const sliceIndex = alignment.edgeWindow.sliceIndex;
const baseTokenization = this.tokens.slice(sliceIndex);
let affectedToken: ContextToken;
@@ -592,7 +592,7 @@
start: appliedLength
},
bestProbFromSet: bestProbFromSet,
subsetId: pendingTokenization.inputSubsetId
subsetId: tokenizationPath.inputSubsetId
}, distribution);
appliedLength += KMWString.length(distribution[0].sample.insert);

@@ -605,7 +605,7 @@
return new ContextTokenization(
this.tokens.slice(0, sliceIndex).concat(tokenization),
null /* tokenMapping */,
determineTaillessTrueKeystroke(pendingTokenization)
determineTaillessTrueKeystroke(tokenizationPath)
);
}
}
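A minimal usage sketch for the renamed parameter of `evaluateTransition`, assuming a `subset` produced by `TokenizationSubsetBuilder` plus `baseTokenization`, `lexicalModel`, `sourceInput`, and `bestProbFromSet` already in scope; it only illustrates the call shape, not the surrounding correction-search plumbing.

```typescript
const path = subset.transitionPaths.get(baseTokenization);
if (path) {
  const nextTokenization = baseTokenization.evaluateTransition(
    path,            // the TokenizationPath (formerly PendingTokenization)
    lexicalModel,    // the active lexical model
    sourceInput,     // the true keystroke's transform
    bestProbFromSet  // best probability across ALL inputs, before subsetting
  );
}
```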
@@ -1122,7 +1122,7 @@ export function assembleTransforms(stackedInserts: string[], stackedDeletes: num
* @param tokenizationAnalysis
* @returns
*/
export function determineTaillessTrueKeystroke(tokenizationAnalysis: PendingTokenization) {
export function determineTaillessTrueKeystroke(tokenizationAnalysis: TokenizationPath) {
// undefined by default; we haven't yet determined if we're still affecting
// the same token that was the tail in the previous tokenization state.
let taillessTrueKeystroke: Transform;
@@ -85,7 +85,7 @@ export interface PathInputProperties {
* This tends to serve as an identifying factor for tokenized input distributions,
* indicating the distributions were all sourced from the same original input event.
*
* @see PendingTokenization.inputSubsetId
* @see TokenizationPath.inputSubsetId
*/
subsetId: number;
}
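As a hedged illustration of the role `subsetId` plays (split transforms may only re-merge when their IDs match, per the `TokenizationPath.inputSubsetId` documentation), here is a hypothetical helper; it is not part of this PR.

```typescript
// Hypothetical helper: two tokenized inputs are candidates for re-merging during
// SearchSpace merges only if they originated from the same input subset.
function cameFromSameSubset(a: PathInputProperties, b: PathInputProperties): boolean {
  return a.subsetId === b.subsetId;
}
```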
@@ -13,21 +13,29 @@ export function generateSubsetId() {
return SUBSET_ID_SEED++;
}

export interface PendingTokenization {
/**
* Tracks metadata about the "path" for transitioning from one source
* ContextTokenization to a potentially-common destination ContextTokenization.
*
* Once evaluated, each entry within its `.inputs` field should have a
* one-to-one relationship with instances of the `SearchPath` class.
*/
export interface TokenizationPath {
/**
* The edge window corresponding to the common tokenization for the subset's inputs
* The edge window corresponding to the common ContextTokenization to which
* this path's inputs will be applied.
*/
alignment: TokenizationEdgeAlignment,

/**
* A set of incoming keystrokes with compatible effects when applied.
*
* If passed to `subsetByInterval`, the transforms should result in a single subset.
* If passed to the `subsetByInterval` function, the transforms should result in a single subset.
*/
inputs: Distribution<Map<number, Transform>>

/**
* A unique identifier associated with this PendingTokenization and its
* A unique identifier associated with this TokenizationPath and its
* transforms within `SearchSpace`s. This ID assists with detecting when
* split transforms are re-merged during SearchSpace merges. Only
* input-sources with matching subset ID come from the same subset, and thus
@@ -42,7 +50,17 @@
}

/**
* Defines a subset of pending tokenization transitions based on potential inputs.
* Defines a subset of pending tokenization transitions based on potential
* inputs.
*
* If more than one `transitionPaths` entry exists, this should directly
* correspond to a unique instance of `SearchCluster` (per affected
* `ContextToken`) once fully processed, each comprised of the corresponding
* `SearchPath` entries constructed from each `transitionPaths` entry.
*
* If only one `transitionPaths` entry exists, it should correspond to
* `SearchPath` instances instead; there is no need for `SearchCluster` overhead
* in such cases.
*/
export interface TokenizationSubset {
/**
@@ -55,7 +73,7 @@ export interface TokenizationSubset {
* them, yielding compatible search paths and tokenization effects after their
* application.
*/
readonly pendingSet: Map<ContextTokenization, PendingTokenization>;
readonly transitionPaths: Map<ContextTokenization, TokenizationPath>;
}
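A hedged sketch of how the two interfaces relate, using invented values; `edgeAlignment` and `baseTokenization` are assumed to exist in scope, and the `key` string is a stand-in for whatever the keying function produces.

```typescript
const path: TokenizationPath = {
  alignment: edgeAlignment,   // shared TokenizationEdgeAlignment for this transition
  inputs: [
    // one tokenized transform (keyed by token index 0) and its probability
    { sample: new Map([[0, { insert: 't', deleteLeft: 0 }]]), p: 0.6 }
  ],
  inputSubsetId: generateSubsetId()
};

const subset: TokenizationSubset = {
  key: 'stand-in-edit-key',
  transitionPaths: new Map([[baseTokenization, path]])
};
```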

export function editKeyer(precomputation: TokenizationTransitionEdits): string[] {
@@ -213,13 +231,13 @@ export class TokenizationSubsetBuilder {
// Maps any number of Tokenizations and their incoming alignment data to a common key
// for final tokenization forms.
const entry: TokenizationSubset = this._subsets.get(key) ?? {
pendingSet: new Map(),
transitionPaths: new Map(),
key: key
}

// Finds any previously-accumulated data corresponding to both the incoming and
// target final tokenization form, creating an empty entry if none yet exists.
const forTokenization: PendingTokenization = entry.pendingSet.get(tokenization) ?? {
const forTokenization: TokenizationPath = entry.transitionPaths.get(tokenization) ?? {
alignment: precomputation.alignment,
inputs: [],
inputSubsetId: generateSubsetId()
@@ -228,7 +246,7 @@
// Adds the incoming tokenized transform data for the pairing...
forTokenization.inputs.push({sample: precomputation.tokenizedTransform, p});
// and ensures that the pairing's data-accumulator is in the map.
entry.pendingSet.set(tokenization, forTokenization);
entry.transitionPaths.set(tokenization, forTokenization);

// Also ensures that the target tokenization's data (accumulating the pairings)
// is made available within the top-level map.
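To illustrate the SearchPath/SearchCluster rule documented on `TokenizationSubset`, here is a hedged consumer-side sketch; `buildSearchPath` and `buildSearchCluster` are hypothetical helpers, and `subsetBuilder` is assumed to be a populated `TokenizationSubsetBuilder`.

```typescript
for (const subset of subsetBuilder.subsets.values()) {
  if (subset.transitionPaths.size === 1) {
    // One source tokenization: plain SearchPath instances suffice.
    const [tokenization, path] = [...subset.transitionPaths.entries()][0];
    buildSearchPath(tokenization, path); // hypothetical
  } else {
    // Several source tokenizations converge on this subset: wrap the per-path
    // results in a SearchCluster (per affected ContextToken).
    buildSearchCluster([...subset.transitionPaths.entries()]); // hypothetical
  }
}
```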
@@ -15,7 +15,7 @@ import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'
import { LexicalModelTypes } from '@keymanapp/common-types';
import { KMWString } from '@keymanapp/web-utils';

import { analyzePathMergesAndSplits, assembleTransforms, buildEdgeWindow, ContextToken, ContextTokenization, EditOperation, EditTuple, ExtendedEditOperation, generateSubsetId, models, PendingTokenization, SearchPath, traceInsertEdits } from '@keymanapp/lm-worker/test-index';
import { analyzePathMergesAndSplits, assembleTransforms, buildEdgeWindow, ContextToken, ContextTokenization, EditOperation, EditTuple, ExtendedEditOperation, generateSubsetId, models, TokenizationPath, SearchPath, traceInsertEdits } from '@keymanapp/lm-worker/test-index';

import Transform = LexicalModelTypes.Transform;
import TrieModel = models.TrieModel;
@@ -96,7 +96,7 @@ describe('ContextTokenization', function() {

// We _could_ flesh this out a bit more... but it's not really needed for this test.
const edgeWindow = buildEdgeWindow(tokens, emptyTransform, false, testEdgeWindowSpec);
let transitionEdits: PendingTokenization = {
let transitionEdits: TokenizationPath = {
alignment: {
merges: [],
splits: [],
@@ -129,7 +129,7 @@ describe('ContextTokenization', function() {

// We _could_ flesh this out a bit more... but it's not really needed for this test.
const edgeWindow = buildEdgeWindow(tokens, emptyTransform, false, testEdgeWindowSpec);
let transitionEdits: PendingTokenization = {
let transitionEdits: TokenizationPath = {
alignment: {
merges: [],
splits: [],
@@ -618,8 +618,8 @@ describe('TokenizationSubsetBuilder', function() {

assert.equal(subsetBuilder.subsets.size, 1); // All transforms have similar impacts.
const subset = [...subsetBuilder.subsets.values()][0];
assert.equal(subset.pendingSet.size, 1); // Built from only one tokenization
assert.deepEqual(subset.pendingSet.get(baseTokenization).inputs,
assert.equal(subset.transitionPaths.size, 1); // Built from only one tokenization
assert.deepEqual(subset.transitionPaths.get(baseTokenization).inputs,
inputDistribution.map((sample) => {
const map = new Map<number, Transform>();
map.set(0, sample.sample);
@@ -646,18 +646,18 @@

assert.equal(subsetBuilder.subsets.size, 2); // The whitespace-producing transform splits off into its own subset.
const subsets = [...subsetBuilder.subsets.values()];
subsets.forEach((subset) => assert.equal(subset.pendingSet.size, 1)); // Built from only one tokenization
subsets.forEach((subset) => assert.equal(subset.transitionPaths.size, 1)); // Built from only one tokenization

const distributionWithoutWhitespace = inputDistribution.slice(0, inputDistribution.length-1);
const extendingSubset = subsets.find((subset) => subset.pendingSet.get(baseTokenization).inputs.length > 1);
assert.deepEqual(extendingSubset.pendingSet.get(baseTokenization).inputs,
const extendingSubset = subsets.find((subset) => subset.transitionPaths.get(baseTokenization).inputs.length > 1);
assert.deepEqual(extendingSubset.transitionPaths.get(baseTokenization).inputs,
distributionWithoutWhitespace.map((sample) => {
const map = new Map<number, Transform>();
map.set(0, sample.sample);
return { sample: map, p: sample.p };
}));

const whitespaceSubset = subsets.find((subset) => subset.pendingSet.get(baseTokenization).inputs.length == 1);
const whitespaceSubset = subsets.find((subset) => subset.transitionPaths.get(baseTokenization).inputs.length == 1);
const whitespaceSample = inputDistribution[inputDistribution.length - 1];
const expectedWhitespaceTransformTokenization = {
sample: (() => {
Expand All @@ -670,7 +670,7 @@ describe('TokenizationSubsetBuilder', function() {
})(),
p: whitespaceSample.p
};
assert.deepEqual(whitespaceSubset.pendingSet.get(baseTokenization).inputs, [expectedWhitespaceTransformTokenization]);
assert.deepEqual(whitespaceSubset.transitionPaths.get(baseTokenization).inputs, [expectedWhitespaceTransformTokenization]);
});

it("builds different subsets for transforms resulting in different total lengths and token count", () => {
@@ -698,54 +698,54 @@

const subsets = [...subsetBuilder.subsets.values()];
const sameTokenLen4Subset = subsets.find((subset) => {
const dataForSet = subset.pendingSet.get(baseTokenization);
const dataForSet = subset.transitionPaths.get(baseTokenization);
const totalMass = dataForSet.inputs.reduce((accum, curr) => accum + curr.p, 0);
// Thanks, floating-point precision.
// Should land both the 'é' (delete 1) and empty-string transform (that lacks deletes)
return Math.abs(totalMass - .45) < 1e-8;
});
assert.isOk(sameTokenLen4Subset);
assert.equal(sameTokenLen4Subset.pendingSet.get(baseTokenization).inputs.length, 2);
assert.equal(sameTokenLen4Subset.transitionPaths.get(baseTokenization).inputs.length, 2);

const sameTokenLen5Subset = subsets.find((subset) => {
const dataForSet = subset.pendingSet.get(baseTokenization);
const dataForSet = subset.transitionPaths.get(baseTokenization);
const totalMass = dataForSet.inputs.reduce((accum, curr) => accum + curr.p, 0);
// Thanks, floating-point precision.
// Should land both the 't' and 's' transforms: adds 1 char, deletes none
return Math.abs(totalMass - .35) < 1e-8;
});
assert.isOk(sameTokenLen5Subset);
assert.equal(sameTokenLen5Subset.pendingSet.get(baseTokenization).inputs.length, 2);
assert.equal(sameTokenLen5Subset.transitionPaths.get(baseTokenization).inputs.length, 2);

const sameTokenLen3Subset = subsets.find((subset) => {
const dataForSet = subset.pendingSet.get(baseTokenization);
const dataForSet = subset.transitionPaths.get(baseTokenization);
const totalMass = dataForSet.inputs.reduce((accum, curr) => accum + curr.p, 0);
// Thanks, floating-point precision.
// Should land the backspace transform.
return Math.abs(totalMass - .1) < 1e-8;
});
assert.isOk(sameTokenLen3Subset);
assert.equal(sameTokenLen3Subset.pendingSet.get(baseTokenization).inputs.length, 1);
assert.equal(sameTokenLen3Subset.transitionPaths.get(baseTokenization).inputs.length, 1);

const plusOneTokenSubset = subsets.find((subset) => {
const dataForSet = subset.pendingSet.get(baseTokenization);
const dataForSet = subset.transitionPaths.get(baseTokenization);
const totalMass = dataForSet.inputs.reduce((accum, curr) => accum + curr.p, 0);
// Thanks, floating-point precision.
// Should land the transform that adds one extra token.
return Math.abs(totalMass - .08) < 1e-8;
});
assert.isOk(plusOneTokenSubset);
assert.equal(plusOneTokenSubset.pendingSet.get(baseTokenization).inputs.length, 1);
assert.equal(plusOneTokenSubset.transitionPaths.get(baseTokenization).inputs.length, 1);

const plusTwoTokensSubset = subsets.find((subset) => {
const dataForSet = subset.pendingSet.get(baseTokenization);
const dataForSet = subset.transitionPaths.get(baseTokenization);
const totalMass = dataForSet.inputs.reduce((accum, curr) => accum + curr.p, 0);
// Thanks, floating-point precision.
// Should land the transform that adds two extra tokens.
return Math.abs(totalMass - .12) < 1e-8;
});
assert.isOk(plusTwoTokensSubset);
assert.equal(plusTwoTokensSubset.pendingSet.get(baseTokenization).inputs.length, 1);
assert.equal(plusTwoTokensSubset.transitionPaths.get(baseTokenization).inputs.length, 1);
});

it("places compatible results from separate tokenizations in the same subset after whitespace", () => {
Expand Down Expand Up @@ -801,7 +801,7 @@ describe('TokenizationSubsetBuilder', function() {
// consider their paths separately after the transition.
assert.equal(subsetBuilder.subsets.size, 1);
// Has entries from two different base tokenizations.
assert.equal([...subsetBuilder.subsets.values()][0].pendingSet.size, 2);
assert.equal([...subsetBuilder.subsets.values()][0].transitionPaths.size, 2);
});

it("places compatible results from separate tokenizations in the same subset (mid-token)", () => {
@@ -865,9 +865,9 @@

// sé + an, sea + n: both result in a four-char long token starting at the same point.
// Same total amount of .deleteLeft is supported for both variations.
const mergedSubset = subsets.find((subset) => subset.pendingSet.size);
const mergedSubset = subsets.find((subset) => subset.transitionPaths.size);
assert.isOk(mergedSubset);
assert.isTrue(mergedSubset.pendingSet.has(twoCharTokenization));
assert.isTrue(mergedSubset.pendingSet.has(threeCharTokenization));
assert.isTrue(mergedSubset.transitionPaths.has(twoCharTokenization));
assert.isTrue(mergedSubset.transitionPaths.has(threeCharTokenization));
});
});