Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,87 @@
public class SourceLinesDiffFinder {
private static final Logger LOG = LoggerFactory.getLogger(SourceLinesDiffFinder.class);

/**
* Upper bound on {@code core_left * core_right} cells of work permitted inside
* {@link MyersDiff#buildPath(List, List)} after common-prefix / common-suffix trimming.
* Myers diff cost is O(D * (N + M)); when the divergent cores are large and disjoint,
* D approaches N + M and the algorithm collapses to O(N^2). 4 000 000 cells caps the
* worst-case in-memory work at roughly 1 s on production hardware.
*/
static final long DIFF_COMPLEXITY_THRESHOLD = 4_000_000L;

/**
* Upper bound on {@code max(core_left, core_right) / min(core_left, core_right)} after
* common-prefix / common-suffix trimming. When one side is much larger than the other,
* the edit distance D is forced to be at least {@code |N - M|} and Myers diff has to
* touch ~D * (N + M) cells regardless of content. The asymmetric shape is the signature
* of "small scanner delta against a large reference-branch file" (e.g. ARM EZ-Commit
* sending 50 changed lines against a 30 000-line Salesforce metadata file).
*/
static final int DIFF_ASYMMETRY_RATIO = 100;

/**
* Floor below which the asymmetry ratio is not enforced. For small files the absolute
* cost is negligible no matter the ratio (e.g. 100 x 1 has ratio 100 but completes in
* microseconds), so this avoids tripping the gate on trivial inputs.
*/
static final int DIFF_ASYMMETRY_MIN_SIZE = 5_000;

public int[] findMatchingLines(List<String> left, List<String> right) {
int[] index = new int[right.size()];
int n = left.size();
int m = right.size();
int[] index = new int[m];

// 1. Trim common prefix (cheap: equal hashes only). Prefix lines map 1:1 to DB.
int prefix = 0;
int maxPrefix = Math.min(n, m);
while (prefix < maxPrefix && left.get(prefix).equals(right.get(prefix))) {
index[prefix] = prefix + 1;
prefix++;
}

// 2. Trim common suffix. Suffix lines map 1:1 to the tail of the DB.
int suffix = 0;
int maxSuffix = Math.min(n, m) - prefix;
while (suffix < maxSuffix
&& left.get(n - 1 - suffix).equals(right.get(m - 1 - suffix))) {
index[m - 1 - suffix] = n - suffix;
suffix++;
}

int leftCore = n - prefix - suffix;
int rightCore = m - prefix - suffix;

// 3. If either core is empty the remaining mapping is trivially zero (no possible
// matches) — return what prefix/suffix already produced.
if (leftCore == 0 || rightCore == 0) {
return index;
}

// 4. Apply gates against the CORE sizes (not the raw inputs). We only refuse work
// that is *forced* to be expensive — never near-identical large files whose prefix
// and suffix trim away most of the bulk.
if ((long) leftCore * rightCore > DIFF_COMPLEXITY_THRESHOLD) {
LOG.warn("Skipping Myers diff: divergent core {}x{} (full input {}x{}) exceeds complexity threshold {}; treating unmatched report lines as new.",
leftCore, rightCore, n, m, DIFF_COMPLEXITY_THRESHOLD);
return index;
}
int maxCore = Math.max(leftCore, rightCore);
int minCore = Math.min(leftCore, rightCore);
if (maxCore >= DIFF_ASYMMETRY_MIN_SIZE && maxCore / minCore > DIFF_ASYMMETRY_RATIO) {
LOG.warn("Skipping Myers diff: asymmetric divergent core {}x{} (full input {}x{}, ratio {}) exceeds ratio threshold {}; treating unmatched report lines as new.",
leftCore, rightCore, n, m, maxCore / minCore, DIFF_ASYMMETRY_RATIO);
return index;
}

// 5. Run Myers on the divergent cores only.
List<String> leftCoreList = left.subList(prefix, prefix + leftCore);
List<String> rightCoreList = right.subList(prefix, prefix + rightCore);

int dbLine = left.size();
int reportLine = right.size();
int dbLine = leftCore;
int reportLine = rightCore;
try {
PathNode node = new MyersDiff<String>().buildPath(left, right);
PathNode node = new MyersDiff<String>().buildPath(leftCoreList, rightCoreList);

while (node.prev != null) {
PathNode prevNode = node.prev;
Expand All @@ -46,9 +120,9 @@ public int[] findMatchingLines(List<String> left, List<String> right) {
// removals
dbLine -= (node.i - prevNode.i);
} else {
// matches
// matches — translate core positions back into full-input coordinates
for (int i = node.i; i > prevNode.i; i--) {
index[reportLine - 1] = dbLine;
index[prefix + reportLine - 1] = prefix + dbLine;
reportLine--;
dbLine--;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -254,4 +254,162 @@ public void shouldIgnoreDeletedLinesAtTheStartOfTheFile() {

assertThat(diff).containsExactly(3, 4);
}

/**
 * Feeds two identical 100 000-line inputs to the finder. The common-prefix trim is
 * expected to consume every line, so no divergent core remains for Myers diff to
 * process, and every report line must map to the database line with the same 1-based
 * number. A generous wall-clock bound guards against falling back to the full diff.
 */
@Test
public void shouldReturnIdentityForLargeIdenticalInputsViaPrefixTrim() {
    final int lineCount = 100_000;
    List<String> dbLines = buildLines(lineCount, "line-");
    List<String> reportLines = buildLines(lineCount, "line-");

    // Only the finder call itself is timed; fixture construction is excluded.
    long startedAt = System.nanoTime();
    int[] matches = new SourceLinesDiffFinder().findMatchingLines(dbLines, reportLines);
    long tookMs = (System.nanoTime() - startedAt) / 1_000_000;

    assertThat(matches).hasSize(lineCount);
    for (int pos = 0; pos < lineCount; pos++) {
        // Result entries are 1-based database line numbers.
        assertThat(matches[pos]).as("line " + pos).isEqualTo(pos + 1);
    }
    assertThat(tookMs)
        .as("prefix trim must short-circuit Myers — wall time was " + tookMs + " ms")
        .isLessThan(2_000L);
}

/**
 * Large near-identical inputs (80K x 80K) that differ only in a 100-line block in the
 * middle. The shared head (39 950 lines) and shared tail (39 950 lines) together account
 * for 79 900 lines, leaving a 100x100 divergent core for Myers diff. Head and tail lines
 * must map to the same 1-based line number, the divergent lines must be unmatched (0),
 * and the call must stay fast.
 */
@Test
public void shouldRunMyersOnSmallCoreForLargeNearIdenticalInputs() {
    final int lineCount = 80_000;
    final int sharedHead = 39_950;
    final int coreSize = 100;
    final int coreEnd = sharedHead + coreSize;
    final int sharedTail = lineCount - coreEnd;

    List<String> dbLines = new ArrayList<>(lineCount);
    List<String> reportLines = new ArrayList<>(lineCount);

    // Identical head on both sides.
    for (int k = 0; k < sharedHead; k++) {
        String common = "shared-" + k;
        dbLines.add(common);
        reportLines.add(common);
    }
    // Middle block: same length on both sides, no line in common.
    for (int k = 0; k < coreSize; k++) {
        dbLines.add("db-divergent-" + k);
        reportLines.add("rp-divergent-" + k);
    }
    // Identical tail on both sides.
    for (int k = 0; k < sharedTail; k++) {
        String common = "tail-" + k;
        dbLines.add(common);
        reportLines.add(common);
    }

    long startedAt = System.nanoTime();
    int[] matches = new SourceLinesDiffFinder().findMatchingLines(dbLines, reportLines);
    long tookMs = (System.nanoTime() - startedAt) / 1_000_000;

    assertThat(matches).hasSize(lineCount);
    for (int pos = 0; pos < lineCount; pos++) {
        if (pos < sharedHead) {
            assertThat(matches[pos]).as("prefix line " + pos).isEqualTo(pos + 1);
        } else if (pos < coreEnd) {
            assertThat(matches[pos]).as("divergent line " + pos).isZero();
        } else {
            assertThat(matches[pos]).as("suffix line " + pos).isEqualTo(pos + 1);
        }
    }
    assertThat(tookMs)
        .as("trim should leave only a 100x100 core — wall time was " + tookMs + " ms")
        .isLessThan(2_000L);
}

/**
 * Asymmetric customer scenario (ARM EZ-Commit pattern): a 30 000-line database file
 * against a 50-line scanner delta that shares no content with it. Nothing can be trimmed
 * as common prefix or suffix, so the asymmetry gate is expected to fire and skip Myers
 * diff; every report line must come back unmatched (0), quickly.
 */
@Test
public void shouldShortCircuitOnAsymmetricDisjointInputs_30kVs50() {
    List<String> dbLines = buildLines(30_000, "db-");
    List<String> reportLines = buildLines(50, "rp-");

    long startedAt = System.nanoTime();
    int[] matches = new SourceLinesDiffFinder().findMatchingLines(dbLines, reportLines);
    long tookMs = (System.nanoTime() - startedAt) / 1_000_000;

    assertThat(matches).hasSize(50).containsOnly(0);
    assertThat(tookMs)
        .as("asymmetry gate must short-circuit Myers — wall time was " + tookMs + " ms")
        .isLessThan(500L);
}

/**
 * BOI worst case: a 100 000-line database file against a 100-line scanner delta with no
 * overlapping content. Both the cell gate and the asymmetry gate apply here, and either
 * one alone is enough to skip Myers diff. Without a guard this input was reported to take
 * ~70 s on M-series hardware and ~19 minutes on the production cloud VM.
 */
@Test
public void shouldShortCircuitOnBoiScale_100kVs100Disjoint() {
    List<String> dbLines = buildLines(100_000, "db-");
    List<String> reportLines = buildLines(100, "rp-");

    long startedAt = System.nanoTime();
    int[] matches = new SourceLinesDiffFinder().findMatchingLines(dbLines, reportLines);
    long tookMs = (System.nanoTime() - startedAt) / 1_000_000;

    assertThat(matches).hasSize(100).containsOnly(0);
    assertThat(tookMs)
        .as("BOI-scale guarded call must complete fast — wall time was " + tookMs + " ms")
        .isLessThan(500L);
}

/**
 * Symmetric, fully disjoint large input (5K x 5K). Nothing is trimmed and the size ratio
 * is 1, so the asymmetry gate stays silent; the cell gate (5 000 x 5 000 = 25 M cells,
 * above the 4 M threshold) is the one expected to skip Myers diff. All report lines must
 * come back unmatched (0).
 */
@Test
public void shouldShortCircuitOnLargeSymmetricDisjointInputs() {
    final int lineCount = 5_000;
    List<String> dbLines = buildLines(lineCount, "db-");
    List<String> reportLines = buildLines(lineCount, "rp-");

    long startedAt = System.nanoTime();
    int[] matches = new SourceLinesDiffFinder().findMatchingLines(dbLines, reportLines);
    long tookMs = (System.nanoTime() - startedAt) / 1_000_000;

    assertThat(matches).hasSize(lineCount).containsOnly(0);
    assertThat(tookMs)
        .as("cell gate must short-circuit Myers — wall time was " + tookMs + " ms")
        .isLessThan(500L);
}

/**
 * Small asymmetric input (4K x 50). The larger side is below
 * {@code DIFF_ASYMMETRY_MIN_SIZE}, so the asymmetry gate must NOT fire, and the 200K
 * cells are far below the cell gate. Myers diff therefore still runs and must report
 * that no report line matches (all zeros).
 */
@Test
public void shouldRunMyersForSmallAsymmetricInputsBelowFloor() {
    List<String> dbLines = buildLines(4_000, "db-");
    List<String> reportLines = buildLines(50, "rp-");

    long startedAt = System.nanoTime();
    int[] matches = new SourceLinesDiffFinder().findMatchingLines(dbLines, reportLines);
    long tookMs = (System.nanoTime() - startedAt) / 1_000_000;

    assertThat(matches).hasSize(50).containsOnly(0);
    assertThat(tookMs)
        .as("below floor and below cell gate — Myers must run normally, wall time was " + tookMs + " ms")
        .isLessThan(5_000L);
}

/**
 * Builds {@code n} distinct, deterministic fixture lines of the form
 * {@code prefix + NNNNNNN}, where {@code NNNNNNN} is the zero-based line index
 * left-padded with zeros to seven digits.
 *
 * <p>Formatting is pinned to {@link java.util.Locale#ROOT} so the generated digits are
 * always ASCII, independent of the JVM's default locale.
 *
 * @param n      number of lines to generate
 * @param prefix text prepended to every generated line
 * @return a new mutable list holding the {@code n} generated lines in index order
 */
private static List<String> buildLines(int n, String prefix) {
    List<String> lines = new ArrayList<>(n);
    for (int i = 0; i < n; i++) {
        // Locale.ROOT: "%07d" would otherwise localize digits under the default locale.
        lines.add(prefix + String.format(java.util.Locale.ROOT, "%07d", i));
    }
    return lines;
}
}