codescan-io · borisbsu · May 13, 2026 · May 13, 2026 · May 14, 2026 · May 14, 2026
diff --git a/...nalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java b/...nalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java
@@ -29,13 +29,87 @@
 public class SourceLinesDiffFinder {
   private static final Logger LOG = LoggerFactory.getLogger(SourceLinesDiffFinder.class);
 
+  /**
+   * Upper bound on {@code core_left * core_right} cells of work permitted inside
+   * {@link MyersDiff#buildPath(List, List)} after common-prefix / common-suffix trimming.
+   * Myers diff cost is O(D * (N + M)); when the divergent cores are large and disjoint,
+   * D approaches N + M and the algorithm collapses to O((N + M)^2). 4 000 000 cells caps the
+   * worst-case in-memory work at roughly 1 s on production hardware.
+   */
+  static final long DIFF_COMPLEXITY_THRESHOLD = 4_000_000L;
+
+  /**
+   * Upper bound on {@code max(core_left, core_right) / min(core_left, core_right)} after
+   * common-prefix / common-suffix trimming. When one side is much larger than the other,
+   * the edit distance D is forced to be at least {@code |N - M|} and Myers diff has to
+   * touch ~D * (N + M) cells regardless of content. The asymmetric shape is the signature
+   * of "small scanner delta against a large reference-branch file" (e.g. ARM EZ-Commit
+   * sending 50 changed lines against a 30 000-line Salesforce metadata file).
+   */
+  static final int DIFF_ASYMMETRY_RATIO = 100;
+
+  /**
+   * Floor below which the asymmetry ratio is not enforced. For small files the absolute
+   * cost is negligible no matter the ratio (e.g. 1 000 x 1 has ratio 1 000 but completes in
+   * microseconds), so this avoids tripping the gate on trivial inputs.
+   */
+  static final int DIFF_ASYMMETRY_MIN_SIZE = 5_000;
+
   public int[] findMatchingLines(List<String> left, List<String> right) {
-    int[] index = new int[right.size()];
+    int n = left.size();
+    int m = right.size();
+    int[] index = new int[m];
+
+    // 1. Trim common prefix (cheap: equal hashes only). Prefix lines map 1:1 to DB.
+    int prefix = 0;
+    int maxPrefix = Math.min(n, m);
+    while (prefix < maxPrefix && left.get(prefix).equals(right.get(prefix))) {
+      index[prefix] = prefix + 1;
+      prefix++;
+    }
+
+    // 2. Trim common suffix. Suffix lines map 1:1 to the tail of the DB.
+    int suffix = 0;
+    int maxSuffix = Math.min(n, m) - prefix;
+    while (suffix < maxSuffix
+      && left.get(n - 1 - suffix).equals(right.get(m - 1 - suffix))) {
+      index[m - 1 - suffix] = n - suffix;
+      suffix++;
+    }
+
+    int leftCore = n - prefix - suffix;
+    int rightCore = m - prefix - suffix;
+
+    // 3. If either core is empty the remaining mapping is trivially zero (no possible
+    // matches) — return what prefix/suffix already produced.
+    if (leftCore == 0 || rightCore == 0) {
+      return index;
+    }
+
+    // 4. Apply gates against the CORE sizes (not the raw inputs). We only refuse work
+    // that is *forced* to be expensive — never near-identical large files whose prefix
+    // and suffix trim away most of the bulk.
+    if ((long) leftCore * rightCore > DIFF_COMPLEXITY_THRESHOLD) {
+      LOG.warn("Skipping Myers diff: divergent core {}x{} (full input {}x{}) exceeds complexity threshold {}; treating unmatched report lines as new.",
+        leftCore, rightCore, n, m, DIFF_COMPLEXITY_THRESHOLD);
+      return index;
+    }
+    int maxCore = Math.max(leftCore, rightCore);
+    int minCore = Math.min(leftCore, rightCore);
+    if (maxCore >= DIFF_ASYMMETRY_MIN_SIZE && maxCore / minCore > DIFF_ASYMMETRY_RATIO) {
+      LOG.warn("Skipping Myers diff: asymmetric divergent core {}x{} (full input {}x{}, ratio {}) exceeds ratio threshold {}; treating unmatched report lines as new.",
+        leftCore, rightCore, n, m, maxCore / minCore, DIFF_ASYMMETRY_RATIO);
+      return index;
+    }
+
+    // 5. Run Myers on the divergent cores only.
+    List<String> leftCoreList = left.subList(prefix, prefix + leftCore);
+    List<String> rightCoreList = right.subList(prefix, prefix + rightCore);
 
-    int dbLine = left.size();
-    int reportLine = right.size();
+    int dbLine = leftCore;
+    int reportLine = rightCore;
     try {
-      PathNode node = new MyersDiff<String>().buildPath(left, right);
+      PathNode node = new MyersDiff<String>().buildPath(leftCoreList, rightCoreList);
 
       while (node.prev != null) {
         PathNode prevNode = node.prev;
@@ -46,9 +120,9 @@ public int[] findMatchingLines(List<String> left, List<String> right) {
           // removals
           dbLine -= (node.i - prevNode.i);
         } else {
-          // matches
+          // matches — translate core positions back into full-input coordinates
           for (int i = node.i; i > prevNode.i; i--) {
-            index[reportLine - 1] = dbLine;
+            index[prefix + reportLine - 1] = prefix + dbLine;
             reportLine--;
             dbLine--;
           }

diff --git a/...sis/src/test/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinderTest.java b/...sis/src/test/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinderTest.java
@@ -254,4 +254,162 @@ public void shouldIgnoreDeletedLinesAtTheStartOfTheFile() {
 
     assertThat(diff).containsExactly(3, 4);
   }
+
+  /**
+   * Large fully-identical inputs (100K x 100K) — prefix trim consumes everything,
+   * Myers diff is never invoked. Verifies that identity is returned via the trim path.
+   */
+  @Test
+  public void shouldReturnIdentityForLargeIdenticalInputsViaPrefixTrim() {
+    int size = 100_000;
+    List<String> database = buildLines(size, "line-");
+    List<String> report = buildLines(size, "line-");
+
+    long start = System.nanoTime();
+    int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report);
+    long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+    assertThat(diff).hasSize(size);
+    for (int i = 0; i < size; i++) {
+      assertThat(diff[i]).as("line " + i).isEqualTo(i + 1);
+    }
+    assertThat(elapsedMs).as("prefix trim must short-circuit Myers — wall time was " + elapsedMs + " ms")
+      .isLessThan(2_000L);
+  }
+
+  /**
+   * Large near-identical inputs (80K x 80K, 100 lines changed in the middle).
+   * Prefix and suffix together trim 79 900 lines; Myers diff runs only on the
+   * 100x100 core. Result must be correct AND fast.
+   */
+  @Test
+  public void shouldRunMyersOnSmallCoreForLargeNearIdenticalInputs() {
+    int total = 80_000;
+    int prefixLen = 39_950;
+    int divergent = 100;
+
+    List<String> database = new ArrayList<>(total);
+    List<String> report = new ArrayList<>(total);
+    for (int i = 0; i < prefixLen; i++) {
+      String shared = "shared-" + i;
+      database.add(shared);
+      report.add(shared);
+    }
+    for (int i = 0; i < divergent; i++) {
+      database.add("db-divergent-" + i);
+      report.add("rp-divergent-" + i);
+    }
+    int suffixLen = total - prefixLen - divergent;
+    for (int i = 0; i < suffixLen; i++) {
+      String shared = "tail-" + i;
+      database.add(shared);
+      report.add(shared);
+    }
+
+    long start = System.nanoTime();
+    int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report);
+    long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+    assertThat(diff).hasSize(total);
+    for (int i = 0; i < prefixLen; i++) {
+      assertThat(diff[i]).as("prefix line " + i).isEqualTo(i + 1);
+    }
+    for (int i = prefixLen; i < prefixLen + divergent; i++) {
+      assertThat(diff[i]).as("divergent line " + i).isZero();
+    }
+    for (int i = prefixLen + divergent; i < total; i++) {
+      assertThat(diff[i]).as("suffix line " + i).isEqualTo(i + 1);
+    }
+    assertThat(elapsedMs).as("trim should leave only a 100x100 core — wall time was " + elapsedMs + " ms")
+      .isLessThan(2_000L);
+  }
+
+  /**
+   * Asymmetric customer scenario: 30 000-line DB file vs 50-line scanner delta
+   * with no overlapping content (ARM EZ-Commit pattern). Prefix/suffix trim
+   * removes nothing; the asymmetry gate must fire and short-circuit Myers.
+   */
+  @Test
+  public void shouldShortCircuitOnAsymmetricDisjointInputs_30kVs50() {
+    List<String> database = buildLines(30_000, "db-");
+    List<String> report = buildLines(50, "rp-");
+
+    long start = System.nanoTime();
+    int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report);
+    long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+    assertThat(diff).hasSize(50);
+    assertThat(diff).containsOnly(0);
+    assertThat(elapsedMs).as("asymmetry gate must short-circuit Myers — wall time was " + elapsedMs + " ms")
+      .isLessThan(500L);
+  }
+
+  /**
+   * BOI worst case: 100 000-line DB file vs 100-line scanner delta with no
+   * overlapping content. Both the asymmetry gate and the cell gate apply;
+   * either is sufficient to short-circuit. Without a guard this took ~70 s
+   * on M-series hardware and ~19 minutes on the production cloud VM.
+   */
+  @Test
+  public void shouldShortCircuitOnBoiScale_100kVs100Disjoint() {
+    List<String> database = buildLines(100_000, "db-");
+    List<String> report = buildLines(100, "rp-");
+
+    long start = System.nanoTime();
+    int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report);
+    long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+    assertThat(diff).hasSize(100);
+    assertThat(diff).containsOnly(0);
+    assertThat(elapsedMs).as("BOI-scale guarded call must complete fast — wall time was " + elapsedMs + " ms")
+      .isLessThan(500L);
+  }
+
+  /**
+   * Symmetric fully-disjoint large input (5K x 5K): trim removes nothing, ratio
+   * is 1 so the asymmetry gate does not fire, but the cell gate (5 000 x 5 000
+   * = 25 M cells > 4 M threshold) must short-circuit.
+   */
+  @Test
+  public void shouldShortCircuitOnLargeSymmetricDisjointInputs() {
+    List<String> database = buildLines(5_000, "db-");
+    List<String> report = buildLines(5_000, "rp-");
+
+    long start = System.nanoTime();
+    int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report);
+    long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+    assertThat(diff).hasSize(5_000);
+    assertThat(diff).containsOnly(0);
+    assertThat(elapsedMs).as("cell gate must short-circuit Myers — wall time was " + elapsedMs + " ms")
+      .isLessThan(500L);
+  }
+
+  /**
+   * Small asymmetric input (4K x 50) — below {@code DIFF_ASYMMETRY_MIN_SIZE} so
+   * the asymmetry gate does NOT fire, and cells (200K) are well under the cell
+   * gate. Myers must still run and produce the correct (no-match) result.
+   */
+  @Test
+  public void shouldRunMyersForSmallAsymmetricInputsBelowFloor() {
+    List<String> database = buildLines(4_000, "db-");
+    List<String> report = buildLines(50, "rp-");
+
+    long start = System.nanoTime();
+    int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report);
+    long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+
+    assertThat(diff).hasSize(50);
+    assertThat(diff).containsOnly(0);
+    assertThat(elapsedMs).as("below floor and below cell gate — Myers must run normally, wall time was " + elapsedMs + " ms")
+      .isLessThan(5_000L);
+  }
+
+  private static List<String> buildLines(int n, String prefix) {
+    List<String> lines = new ArrayList<>(n);
+    for (int i = 0; i < n; i++) {
+      lines.add(prefix + String.format("%07d", i));
+    }
+    return lines;
+  }
 }