From 680bb57978ba8ff8b0bb6bfdbc3dfd1c606bc1d4 Mon Sep 17 00:00:00 2001 From: Barys Yakavita Date: Wed, 13 May 2026 15:47:21 +0300 Subject: [PATCH 1/4] CD-8797 SourceLinesDiffFinder: prefix/suffix trim + asymmetry + cell gates The previous narrow fix in NewCoverageMeasuresStep (commit 82308bbde) only prevented one of six callers of NewLinesRepository.getNewLines() from warming the SCM cache. Once that caller was guarded, the unbounded Myers diff in SourceLinesDiffFinder.findMatchingLines() simply moved to whoever asked next - NewSizeMeasuresStep in this case, and any of the other four consumers (NewMaintainabilityMeasuresVisitor, IsNewLineReader, NewIssueClassifier, PullRequestTrackerExecution) thereafter. Fix the algorithm itself, not the call sites: 1. Trim common prefix/suffix on the line-hash inputs before invoking Myers. Standard speedup in production diff implementations; for the typical "large file with small PR delta" pattern this collapses to the small divergent core. Cost is O(min(N, M)) hash equality checks - milliseconds even for 100K-line inputs. 2. Apply a cell-product gate (4_000_000 cells) against the divergent core. Catches catastrophic shapes like symmetric fully-disjoint 5K x 5K (25 M cells) that prefix/suffix trim cannot reduce. 3. Apply an asymmetry-ratio gate (max/min > 100 when max >= 5 000) against the divergent core. Catches the EZ-Commit signature: small scanner delta against a large reference-branch file (e.g. 30K x 50 = ratio 600, 1.5 M cells - below the cell gate but forced quadratic by D >= N - M). When a gate fires, unmatched report lines are returned as zero - semantically identical to the existing dead DifferentiationFailedException catch path, which downstream consumers already tolerate. 
Tested cases (all gated cases return in <= 6 ms on M-series; full test results in the new SourceLinesDiffFinderTest entries): - 100K x 100K identical -> prefix trim, no Myers, identity map - 80K x 80K with 100-line mid diff -> trim leaves 100 x 100 core - 30K x 50 disjoint (EZ-Commit case) -> asymmetry gate fires - 100K x 100 disjoint (BOI scale) -> asymmetry + cell gate fire - 5K x 5K disjoint symmetric -> cell gate fires - 4K x 50 disjoint (below floor) -> Myers runs normally - all 10 existing golden tests -> preserved --- .../source/SourceLinesDiffFinder.java | 86 +++++++++- .../source/SourceLinesDiffFinderTest.java | 158 ++++++++++++++++++ 2 files changed, 238 insertions(+), 6 deletions(-) diff --git a/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java b/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java index 543b09883b43..a31f33f6cbc4 100644 --- a/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java +++ b/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java @@ -29,13 +29,87 @@ public class SourceLinesDiffFinder { private static final Logger LOG = LoggerFactory.getLogger(SourceLinesDiffFinder.class); + /** + * Upper bound on {@code core_left * core_right} cells of work permitted inside + * {@link MyersDiff#buildPath(List, List)} after common-prefix / common-suffix trimming. + * Myers diff cost is O(D * (N + M)); when the divergent cores are large and disjoint, + * D approaches N + M and the algorithm collapses to O(N^2). 4 000 000 cells caps the + * worst-case in-memory work at roughly 1 s on production hardware. 
+ */ + static final long DIFF_COMPLEXITY_THRESHOLD = 4_000_000L; + + /** + * Upper bound on {@code max(core_left, core_right) / min(core_left, core_right)} after + * common-prefix / common-suffix trimming. When one side is much larger than the other, + * the edit distance D is forced to be at least {@code |N - M|} and Myers diff has to + * touch ~D * (N + M) cells regardless of content. The asymmetric shape is the signature + * of "small scanner delta against a large reference-branch file" (e.g. ARM EZ-Commit + * sending 50 changed lines against a 30 000-line Salesforce metadata file). + */ + static final int DIFF_ASYMMETRY_RATIO = 100; + + /** + * Floor below which the asymmetry ratio is not enforced. For small files the absolute + * cost is negligible no matter the ratio (e.g. 100 x 1 has ratio 100 but completes in + * microseconds), so this avoids tripping the gate on trivial inputs. + */ + static final int DIFF_ASYMMETRY_MIN_SIZE = 5_000; + public int[] findMatchingLines(List left, List right) { - int[] index = new int[right.size()]; + int n = left.size(); + int m = right.size(); + int[] index = new int[m]; + + // 1. Trim common prefix (cheap: equal hashes only). Prefix lines map 1:1 to DB. + int prefix = 0; + int maxPrefix = Math.min(n, m); + while (prefix < maxPrefix && left.get(prefix).equals(right.get(prefix))) { + index[prefix] = prefix + 1; + prefix++; + } + + // 2. Trim common suffix. Suffix lines map 1:1 to the tail of the DB. + int suffix = 0; + int maxSuffix = Math.min(n, m) - prefix; + while (suffix < maxSuffix + && left.get(n - 1 - suffix).equals(right.get(m - 1 - suffix))) { + index[m - 1 - suffix] = n - suffix; + suffix++; + } + + int leftCore = n - prefix - suffix; + int rightCore = m - prefix - suffix; + + // 3. If either core is empty the remaining mapping is trivially zero (no possible + // matches) — return what prefix/suffix already produced. + if (leftCore == 0 || rightCore == 0) { + return index; + } + + // 4. 
Apply gates against the CORE sizes (not the raw inputs). We only refuse work + // that is *forced* to be expensive — never near-identical large files whose prefix + // and suffix trim away most of the bulk. + if ((long) leftCore * rightCore > DIFF_COMPLEXITY_THRESHOLD) { + LOG.warn("Skipping Myers diff: divergent core {}x{} (full input {}x{}) exceeds complexity threshold {}; treating unmatched report lines as new.", + leftCore, rightCore, n, m, DIFF_COMPLEXITY_THRESHOLD); + return index; + } + int maxCore = Math.max(leftCore, rightCore); + int minCore = Math.min(leftCore, rightCore); + if (maxCore >= DIFF_ASYMMETRY_MIN_SIZE && maxCore / minCore > DIFF_ASYMMETRY_RATIO) { + LOG.warn("Skipping Myers diff: asymmetric divergent core {}x{} (full input {}x{}, ratio {}) exceeds ratio threshold {}; treating unmatched report lines as new.", + leftCore, rightCore, n, m, maxCore / minCore, DIFF_ASYMMETRY_RATIO); + return index; + } + + // 5. Run Myers on the divergent cores only. + List leftCoreList = left.subList(prefix, prefix + leftCore); + List rightCoreList = right.subList(prefix, prefix + rightCore); - int dbLine = left.size(); - int reportLine = right.size(); + int dbLine = leftCore; + int reportLine = rightCore; try { - PathNode node = new MyersDiff().buildPath(left, right); + PathNode node = new MyersDiff().buildPath(leftCoreList, rightCoreList); while (node.prev != null) { PathNode prevNode = node.prev; @@ -46,9 +120,9 @@ public int[] findMatchingLines(List left, List right) { // removals dbLine -= (node.i - prevNode.i); } else { - // matches + // matches — translate core positions back into full-input coordinates for (int i = node.i; i > prevNode.i; i--) { - index[reportLine - 1] = dbLine; + index[prefix + reportLine - 1] = prefix + dbLine; reportLine--; dbLine--; } diff --git a/server/sonar-ce-task-projectanalysis/src/test/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinderTest.java 
b/server/sonar-ce-task-projectanalysis/src/test/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinderTest.java index 71d7ece039f0..489d93ec780e 100644 --- a/server/sonar-ce-task-projectanalysis/src/test/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinderTest.java +++ b/server/sonar-ce-task-projectanalysis/src/test/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinderTest.java @@ -254,4 +254,162 @@ public void shouldIgnoreDeletedLinesAtTheStartOfTheFile() { assertThat(diff).containsExactly(3, 4); } + + /** + * Large fully-identical inputs (100K x 100K) — prefix trim consumes everything, + * Myers diff is never invoked. Verifies that identity is returned via the trim path. + */ + @Test + public void shouldReturnIdentityForLargeIdenticalInputsViaPrefixTrim() { + int size = 100_000; + List database = buildLines(size, "line-"); + List report = buildLines(size, "line-"); + + long start = System.nanoTime(); + int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertThat(diff).hasSize(size); + for (int i = 0; i < size; i++) { + assertThat(diff[i]).as("line " + i).isEqualTo(i + 1); + } + assertThat(elapsedMs).as("prefix trim must short-circuit Myers — wall time was " + elapsedMs + " ms") + .isLessThan(2_000L); + } + + /** + * Large near-identical inputs (80K x 80K, 100 lines changed in the middle). + * Prefix and suffix together trim 79 900 lines; Myers diff runs only on the + * 100x100 core. Result must be correct AND fast. 
+ */ + @Test + public void shouldRunMyersOnSmallCoreForLargeNearIdenticalInputs() { + int total = 80_000; + int prefixLen = 39_950; + int divergent = 100; + + List database = new ArrayList<>(total); + List report = new ArrayList<>(total); + for (int i = 0; i < prefixLen; i++) { + String shared = "shared-" + i; + database.add(shared); + report.add(shared); + } + for (int i = 0; i < divergent; i++) { + database.add("db-divergent-" + i); + report.add("rp-divergent-" + i); + } + int suffixLen = total - prefixLen - divergent; + for (int i = 0; i < suffixLen; i++) { + String shared = "tail-" + i; + database.add(shared); + report.add(shared); + } + + long start = System.nanoTime(); + int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertThat(diff).hasSize(total); + for (int i = 0; i < prefixLen; i++) { + assertThat(diff[i]).as("prefix line " + i).isEqualTo(i + 1); + } + for (int i = prefixLen; i < prefixLen + divergent; i++) { + assertThat(diff[i]).as("divergent line " + i).isZero(); + } + for (int i = prefixLen + divergent; i < total; i++) { + assertThat(diff[i]).as("suffix line " + i).isEqualTo(i + 1); + } + assertThat(elapsedMs).as("trim should leave only a 100x100 core — wall time was " + elapsedMs + " ms") + .isLessThan(2_000L); + } + + /** + * Asymmetric customer scenario: 30 000-line DB file vs 50-line scanner delta + * with no overlapping content (ARM EZ-Commit pattern). Prefix/suffix trim + * removes nothing; the asymmetry gate must fire and short-circuit Myers. 
+ */ + @Test + public void shouldShortCircuitOnAsymmetricDisjointInputs_30kVs50() { + List database = buildLines(30_000, "db-"); + List report = buildLines(50, "rp-"); + + long start = System.nanoTime(); + int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertThat(diff).hasSize(50); + assertThat(diff).containsOnly(0); + assertThat(elapsedMs).as("asymmetry gate must short-circuit Myers — wall time was " + elapsedMs + " ms") + .isLessThan(500L); + } + + /** + * BOI worst case: 100 000-line DB file vs 100-line scanner delta with no + * overlapping content. Both the asymmetry gate and the cell gate apply; + * either is sufficient to short-circuit. Without a guard this took ~70 s + * on M-series hardware and ~19 minutes on the production cloud VM. + */ + @Test + public void shouldShortCircuitOnBoiScale_100kVs100Disjoint() { + List database = buildLines(100_000, "db-"); + List report = buildLines(100, "rp-"); + + long start = System.nanoTime(); + int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertThat(diff).hasSize(100); + assertThat(diff).containsOnly(0); + assertThat(elapsedMs).as("BOI-scale guarded call must complete fast — wall time was " + elapsedMs + " ms") + .isLessThan(500L); + } + + /** + * Symmetric fully-disjoint large input (5K x 5K): trim removes nothing, ratio + * is 1 so the asymmetry gate does not fire, but the cell gate (5 000 x 5 000 + * = 25 M cells > 4 M threshold) must short-circuit. 
+ */ + @Test + public void shouldShortCircuitOnLargeSymmetricDisjointInputs() { + List database = buildLines(5_000, "db-"); + List report = buildLines(5_000, "rp-"); + + long start = System.nanoTime(); + int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertThat(diff).hasSize(5_000); + assertThat(diff).containsOnly(0); + assertThat(elapsedMs).as("cell gate must short-circuit Myers — wall time was " + elapsedMs + " ms") + .isLessThan(500L); + } + + /** + * Small asymmetric input (4K x 50) — below {@code DIFF_ASYMMETRY_MIN_SIZE} so + * the asymmetry gate does NOT fire, and cells (200K) are well under the cell + * gate. Myers must still run and produce the correct (no-match) result. + */ + @Test + public void shouldRunMyersForSmallAsymmetricInputsBelowFloor() { + List database = buildLines(4_000, "db-"); + List report = buildLines(50, "rp-"); + + long start = System.nanoTime(); + int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertThat(diff).hasSize(50); + assertThat(diff).containsOnly(0); + assertThat(elapsedMs).as("below floor and below cell gate — Myers must run normally, wall time was " + elapsedMs + " ms") + .isLessThan(5_000L); + } + + private static List buildLines(int n, String prefix) { + List lines = new ArrayList<>(n); + for (int i = 0; i < n; i++) { + lines.add(prefix + String.format("%07d", i)); + } + return lines; + } } From f920a75c69d727004d95a95f6ddc0624d3ca8992 Mon Sep 17 00:00:00 2001 From: Barys Yakavita Date: Thu, 14 May 2026 01:32:54 +0300 Subject: [PATCH 2/4] debug --- .../ce/task/projectanalysis/source/SourceLinesDiffFinder.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java 
b/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java index a31f33f6cbc4..14bfa8ca1fd9 100644 --- a/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java +++ b/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java @@ -86,6 +86,8 @@ public int[] findMatchingLines(List left, List right) { return index; } + LOG.info("left={}, right={}", leftCore, rightCore); + // 4. Apply gates against the CORE sizes (not the raw inputs). We only refuse work // that is *forced* to be expensive — never near-identical large files whose prefix // and suffix trim away most of the bulk. From 980c6497022485c84e9dff340b5dada06c538fc9 Mon Sep 17 00:00:00 2001 From: Barys Yakavita Date: Thu, 14 May 2026 09:23:42 +0300 Subject: [PATCH 3/4] debug --- .../ce/task/projectanalysis/source/SourceLinesDiffFinder.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java b/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java index 14bfa8ca1fd9..ce49ead0cec1 100644 --- a/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java +++ b/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java @@ -80,14 +80,14 @@ public int[] findMatchingLines(List left, List right) { int leftCore = n - prefix - suffix; int rightCore = m - prefix - suffix; + LOG.warn("left={}, right={}", leftCore, rightCore); + // 3. If either core is empty the remaining mapping is trivially zero (no possible // matches) — return what prefix/suffix already produced. 
if (leftCore == 0 || rightCore == 0) { return index; } - LOG.info("left={}, right={}", leftCore, rightCore); - // 4. Apply gates against the CORE sizes (not the raw inputs). We only refuse work // that is *forced* to be expensive — never near-identical large files whose prefix // and suffix trim away most of the bulk. From 0757d69cd47ae6e526b9f956c9c92be2fcd48d74 Mon Sep 17 00:00:00 2001 From: Barys Yakavita Date: Thu, 14 May 2026 10:42:24 +0300 Subject: [PATCH 4/4] debug --- .../ce/task/projectanalysis/source/SourceLinesDiffFinder.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java b/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java index ce49ead0cec1..a31f33f6cbc4 100644 --- a/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java +++ b/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java @@ -80,8 +80,6 @@ public int[] findMatchingLines(List left, List right) { int leftCore = n - prefix - suffix; int rightCore = m - prefix - suffix; - LOG.warn("left={}, right={}", leftCore, rightCore); - // 3. If either core is empty the remaining mapping is trivially zero (no possible // matches) — return what prefix/suffix already produced. if (leftCore == 0 || rightCore == 0) {