Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
* in the context and associated correction-search progress and results.
*/

import { buildMergedTransform } from "@keymanapp/models-templates";
import { LexicalModelTypes } from '@keymanapp/common-types';
import { deepCopy, KMWString } from "@keymanapp/web-utils";

Expand Down Expand Up @@ -183,59 +182,19 @@ export class ContextToken {
* @param lexicalModel
* @returns
*/
static merge(tokensToMerge: ContextToken[], lexicalModel: LexicalModel): ContextToken {
static merge(tokensToMerge: ContextToken[]): ContextToken {
if(tokensToMerge.length < 1) {
return null;
}

// Assumption: if we're merging a token, it's not whitespace.
// Thus, we don't set the .isWhitespace flag field.
const resultToken = new ContextToken(lexicalModel);

let lastSourceInput: PathInputProperties;
let lastInputDistrib: Distribution<Transform>;
for(const token of tokensToMerge) {
const inputCount = token.inputCount;
let startIndex = 0;

if(inputCount == 0) {
continue;
}

// Are we re-merging on a previously split transform?
if(lastSourceInput?.segment.trueTransform != token.inputSegments[0].segment.trueTransform) {
if(lastSourceInput) {
resultToken.addInput(lastSourceInput, lastInputDistrib);
} // else: there's nothing to add as input
} else {
// If so, re-merge it!
startIndex++;

lastInputDistrib = lastInputDistrib?.map((entry, index) => {
return {
sample: buildMergedTransform(entry.sample, token.searchSpace.inputSequence[0][index].sample),
p: entry.p
}
});

// In case there's only one input that needs merging on both ends.
if(inputCount == 1) {
// There's potential that the next incoming token needs to merge with this.
continue;
} else {
resultToken.addInput(lastSourceInput, lastInputDistrib);
}
}
lastSourceInput = null;
lastInputDistrib = null;

// Ignore the last entry for now - it may need to merge with a matching
// entry in the next token!
for(let i = startIndex; i < inputCount - 1; i++) {
resultToken.addInput(token.inputSegments[i], token.searchSpace.inputSequence[i]);
}
lastSourceInput = token.inputSegments[inputCount-1];
lastInputDistrib = token.searchSpace.inputSequence[inputCount-1];
const resultToken = new ContextToken(tokensToMerge.shift());
while(tokensToMerge.length > 0) {
const next = tokensToMerge.shift();
resultToken._searchSpace = resultToken._searchSpace.merge(next._searchSpace);
}

resultToken.addInput(lastSourceInput, lastInputDistrib);

return resultToken;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,7 @@ export class ContextTokenization {
// consider: move to ContextToken as class method. (static?)
const merge = merges.shift();
const tokensToMerge = merge.inputs.map((m) => baseTokenization[m.index]);
const mergeResult = ContextToken.merge(tokensToMerge, lexicalModel);
const mergeResult = ContextToken.merge(tokensToMerge);
tokenization.push(mergeResult);
i = merge.inputs[merge.inputs.length - 1].index;
continue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import { QueueComparator as Comparator, KMWString, PriorityQueue } from '@keymanapp/web-utils';
import { LexicalModelTypes } from '@keymanapp/common-types';
import { buildMergedTransform } from '@keymanapp/models-templates';

import { EDIT_DISTANCE_COST_SCALE, SearchNode, SearchResult } from './distance-modeler.js';
import { generateSpaceSeed, PathResult, SearchSpace, PathInputProperties } from './search-space.js';
Expand Down Expand Up @@ -148,19 +149,6 @@ export class SearchPath implements SearchSpace {
this.bestProbInEdge = 1;
}

/**
* Retrieves the sequences of inputs that led to this SearchPath.
*/
public get inputSequence(): Distribution<Transform>[] {
if(this.parents[0]) {
return [...this.parents[0].inputSequence, this.inputs];
} else if(this.inputs) {
return [this.inputs];
} else {
return [];
}
}

public get constituentPaths(): SearchPath[][] {
const parentPaths = this.parents[0]?.constituentPaths ?? [];
if(parentPaths.length > 0) {
Expand Down Expand Up @@ -256,6 +244,78 @@ export class SearchPath implements SearchSpace {
this.selectionQueue = new PriorityQueue<SearchNode>(QUEUE_NODE_COMPARATOR, entries);
}

/**
 * Appends the provided SearchSpace's search data to this instance, treating
 * `this` as the head of the combined path. Spaces are assumed to be in
 * sequence, with `this` as the head 'space'. When the adjacent inputs of the
 * two spaces originate from the same split transform (matching
 * `transitionId`s), the split halves are re-merged into a single input entry.
 * @param space The SearchSpace whose inputs should follow this instance's.
 * @returns A SearchSpace representing the combined search range.
 */
public merge(space: SearchSpace): SearchSpace {
  // Head node for the incoming path is empty, so skip it.
  if(space.parents.length == 0) {
    return this;
  }

  // Merge any parents first as a baseline. We have to come after their
  // effects are merged in, anyway.
  // NOTE(review): the `: [this]` fallback looks unreachable — the guard above
  // already returned when `space.parents` is empty. Confirm and simplify.
  const parentMerges = space.parents?.length > 0 ? space.parents.map((p) => this.merge(p)) : [this];

  // If `space` has multiple parents it is a SearchCluster; only the first
  // merge result is consumed below.
  const parentMerge = parentMerges[0];

  // Special case: if we've reached the head of the space to be merged, check
  // for a split transform.
  // - we return `this` from the root, so if that's what we received, we're
  //   on the first descendant - the first path component.
  if(space instanceof SearchPath) {
    if(parentMerge != this) {
      return new SearchPath(parentMerge, space.inputs, space.inputSource);
    }

    const localInputId = this.inputSource?.segment.transitionId;
    const spaceInputId = space.inputSource?.segment.transitionId;
    // The 'id' may be undefined in some unit tests and for tokens
    // reconstructed after a backspace. In either case, we consider the
    // related results as fully separate; our reconstructions are
    // per-codepoint.
    if(localInputId != spaceInputId || localInputId === undefined) {
      return new SearchPath(parentMerge, space.inputs, space.inputSource);
    }

    // Get the twin halves that were split.
    // Assumption: the two halves are in their original order, etc.
    const localInputs = this.inputs;
    const spaceInputs = space.inputs;

    // Sanity check - ensure that the input distributions have the same length;
    // if not, this shouldn't represent a SearchPath split!
    if(localInputs.length != spaceInputs.length) {
      return new SearchPath(parentMerge, space.inputs, space.inputSource);
    }

    // Merge them! Each local entry is rejoined pairwise with its twin from
    // the other half; the local half's probability (`entry.p`) is kept.
    const mergedInputs = localInputs?.map((entry, index) => {
      return {
        sample: buildMergedTransform(entry.sample, spaceInputs[index].sample),
        p: entry.p
      }
    });

    // Now to re-merge the two halves: keep this half's segment `start`, adopt
    // the other half's `end` so the segment spans the full original range.
    const mergedInputSource = {
      ...this.inputSource,
      segment: {
        ...this.inputSource.segment,
        end: space.inputSource.segment.end
      }
    };

    // Drop an `undefined` end entirely so the key's mere presence doesn't
    // trip deep-equality comparisons elsewhere (e.g. in unit tests).
    if(mergedInputSource.segment.end == undefined) {
      delete mergedInputSource.segment.end;
    }

    return new SearchPath(this.parentSpace, mergedInputs, mergedInputSource);
  } else {
    // If the parent was a cluster, the cluster itself is the merge.
    return parentMerge;
  }
}

public split(charIndex: number): [SearchSpace, SearchPath] {
const model = this.model;
const internalSplitIndex = charIndex - (this.codepointLength - this.edgeLength);
Expand Down Expand Up @@ -484,4 +544,40 @@ export class SearchPath implements SearchSpace {

return components.join('+');
}

/**
 * Determines whether the provided SearchSpace represents the same search
 * range as this instance, matching on identity, `spaceId`, or the most
 * recent input-source properties.
 * @param space The space to compare against; may be null/undefined.
 * @returns `true` if the two spaces are considered equivalent.
 */
isSameSpace(space: SearchSpace): boolean {
  // Guard first: a falsy `space` can never match, and the property accesses
  // below would throw on it. (The original checked this only after reading
  // `space.spaceId`, so `isSameSpace(null)` threw instead of returning false.)
  if(!space) {
    return false;
  }

  // Easiest cases: when the instances or their `spaceId`s match, we have
  // a perfect match.
  if(this == space || this.spaceId == space.spaceId) {
    return true;
  }

  // A different SearchSpace type is an easy filter.
  if(!(space instanceof SearchPath)) {
    return false;
  }

  // If the most recent 'input source' was not triggered from the same input
  // subset, it's not a match.
  if(this.inputSource?.subsetId != space.inputSource?.subsetId) {
    return false;
  }

  // We check the indices of the input's split if one occurred.
  if(this.inputSource?.segment.end != space.inputSource?.segment.end) {
    return false;
  }

  if(this.inputSource?.segment.start != space.inputSource?.segment.start) {
    return false;
  }

  return true;

  // Commented out b/c parentSpace-checks cause unit-test ID issues after... a... split.
  //
  // // Finally, we recursively verify that the parent matches. If there IS no parent,
  // // we verify that _that_ aspect matches.
  // return this.parentSpace?.isSameSpace(space.parentSpace) ?? this.parentSpace == space.parentSpace;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -190,15 +190,6 @@ export interface SearchSpace {
*/
readonly inputCount: number;

/**
* Retrieves the sequence of inputs that led to this SearchSpace.
*
* THIS WILL BE REMOVED SHORTLY in favor of `constituentPaths` below, which
* provides an improved view into the data and models multiple paths to the
* space when they exist. (Once SearchPath takes on merging & splitting)
*/
readonly inputSequence: Distribution<Transform>[];

/**
* Reports the length in codepoints of corrected text represented by completed
* paths from this instance.
Expand All @@ -225,6 +216,15 @@ export interface SearchSpace {
*/
get sourceRangeKey(): string;

/**
* Extends this SearchSpace with the provided SearchSpace's search properties,
* widening the represented search range accordingly. If this operation
* re-merges the result of a previous .split() call, the two halves
* of any split input components will be fully re-merged.
* @param space
*/
merge(space: SearchSpace): SearchSpace;

/**
* Splits this SearchSpace into two halves at the specified codepoint index.
* The 'head' component will maximally re-use existing cached data, while the
Expand All @@ -240,4 +240,6 @@ export interface SearchSpace {
* Intended only for use during unit testing.
*/
readonly constituentPaths: SearchPath[][];

isSameSpace(space: SearchSpace): boolean;
}
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ describe('ContextToken', function() {
const token2 = new ContextToken(plainModel, "'");
const token3 = new ContextToken(plainModel, "t");

const merged = ContextToken.merge([token1, token2, token3], plainModel);
const merged = ContextToken.merge([token1, token2, token3]);
assert.equal(merged.exampleInput, "can't");
token1.inputSegments.forEach((entry) => assert.isTrue(merged.inputSegments.indexOf(entry) > -1));
token2.inputSegments.forEach((entry) => assert.isTrue(merged.inputSegments.indexOf(entry) > -1));
Expand Down Expand Up @@ -155,7 +155,7 @@ describe('ContextToken', function() {
subsetId: srcSubsetId
}, [{sample: {insert: 't', deleteLeft: 0, deleteRight: 0, id: 1}, p: 1}]);

const merged = ContextToken.merge([token1, token2, token3], plainModel);
const merged = ContextToken.merge([token1, token2, token3]);
assert.equal(merged.exampleInput, "can't");
assert.deepEqual(merged.inputSegments, [ {
segment: {
Expand Down Expand Up @@ -253,7 +253,7 @@ describe('ContextToken', function() {
subsetId: srcSubsetIds[3]
}, [{sample: srcTransforms[3], p: 1}]);

const merged = ContextToken.merge(tokensToMerge, plainModel);
const merged = ContextToken.merge(tokensToMerge);
assert.equal(merged.exampleInput, "applesandsourgrapes");
assert.deepEqual(merged.inputSegments, srcTransforms.map((t, i) => ({
segment: {
Expand Down Expand Up @@ -352,7 +352,7 @@ describe('ContextToken', function() {
subsetId: srcSubsetIds[3]
}, [{sample: srcTransforms[3], p: 1}]);

const merged = ContextToken.merge(tokensToMerge, plainModel);
const merged = ContextToken.merge(tokensToMerge);
assert.equal(merged.exampleInput, toMathematicalSMP("applesandsourgrapes"));
assert.deepEqual(merged.inputSegments, srcTransforms.map((t, i) => ({
segment: {
Expand Down
Loading