diff --git a/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java b/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java index 543b09883b43..a31f33f6cbc4 100644 --- a/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java +++ b/server/sonar-ce-task-projectanalysis/src/main/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinder.java @@ -29,13 +29,87 @@ public class SourceLinesDiffFinder { private static final Logger LOG = LoggerFactory.getLogger(SourceLinesDiffFinder.class); + /** + * Upper bound on {@code core_left * core_right} cells of work permitted inside + * {@link MyersDiff#buildPath(List, List)} after common-prefix / common-suffix trimming. + * Myers diff cost is O(D * (N + M)); when the divergent cores are large and disjoint, + * D approaches N + M and the algorithm collapses to O(N^2). 4 000 000 cells caps the + * worst-case in-memory work at roughly 1 s on production hardware. + */ + static final long DIFF_COMPLEXITY_THRESHOLD = 4_000_000L; + + /** + * Upper bound on {@code max(core_left, core_right) / min(core_left, core_right)} (integer division) after + * common-prefix / common-suffix trimming. When one side is much larger than the other, + * the edit distance D is forced to be at least {@code |N - M|} and Myers diff has to + * touch ~D * (N + M) cells regardless of content. The asymmetric shape is the signature + * of "small scanner delta against a large reference-branch file" (e.g. ARM EZ-Commit + * sending 50 changed lines against a 30 000-line Salesforce metadata file). + */ + static final int DIFF_ASYMMETRY_RATIO = 100; + + /** + * Floor below which the asymmetry ratio is not enforced. For small files the absolute + * cost is negligible no matter the ratio (e.g. 
1 000 x 1 has ratio 1 000 but completes in + * microseconds), so this avoids tripping the gate on trivial inputs. + */ + static final int DIFF_ASYMMETRY_MIN_SIZE = 5_000; + public int[] findMatchingLines(List left, List right) { - int[] index = new int[right.size()]; + int n = left.size(); + int m = right.size(); + int[] index = new int[m]; + + // 1. Trim common prefix (cheap: equal hashes only). Prefix lines map 1:1 to DB. + int prefix = 0; + int maxPrefix = Math.min(n, m); + while (prefix < maxPrefix && left.get(prefix).equals(right.get(prefix))) { + index[prefix] = prefix + 1; + prefix++; + } + + // 2. Trim common suffix. Suffix lines map 1:1 to the tail of the DB. + int suffix = 0; + int maxSuffix = Math.min(n, m) - prefix; + while (suffix < maxSuffix + && left.get(n - 1 - suffix).equals(right.get(m - 1 - suffix))) { + index[m - 1 - suffix] = n - suffix; + suffix++; + } + + int leftCore = n - prefix - suffix; + int rightCore = m - prefix - suffix; + + // 3. If either core is empty the remaining mapping is trivially zero (no possible + // matches) — return what prefix/suffix already produced. + if (leftCore == 0 || rightCore == 0) { + return index; + } + + // 4. Apply gates against the CORE sizes (not the raw inputs). We only refuse work + // that is *forced* to be expensive — never near-identical large files whose prefix + // and suffix trim away most of the bulk. 
+ if ((long) leftCore * rightCore > DIFF_COMPLEXITY_THRESHOLD) { + LOG.warn("Skipping Myers diff: divergent core {}x{} (full input {}x{}) exceeds complexity threshold {}; treating unmatched report lines as new.", + leftCore, rightCore, n, m, DIFF_COMPLEXITY_THRESHOLD); + return index; + } + int maxCore = Math.max(leftCore, rightCore); + int minCore = Math.min(leftCore, rightCore); + if (maxCore >= DIFF_ASYMMETRY_MIN_SIZE && maxCore / minCore > DIFF_ASYMMETRY_RATIO) { + LOG.warn("Skipping Myers diff: asymmetric divergent core {}x{} (full input {}x{}, ratio {}) exceeds ratio threshold {}; treating unmatched report lines as new.", + leftCore, rightCore, n, m, maxCore / minCore, DIFF_ASYMMETRY_RATIO); + return index; + } + + // 5. Run Myers on the divergent cores only. + List leftCoreList = left.subList(prefix, prefix + leftCore); + List rightCoreList = right.subList(prefix, prefix + rightCore); - int dbLine = left.size(); - int reportLine = right.size(); + int dbLine = leftCore; + int reportLine = rightCore; try { - PathNode node = new MyersDiff().buildPath(left, right); + PathNode node = new MyersDiff().buildPath(leftCoreList, rightCoreList); while (node.prev != null) { PathNode prevNode = node.prev; @@ -46,9 +120,9 @@ public int[] findMatchingLines(List left, List right) { // removals dbLine -= (node.i - prevNode.i); } else { - // matches + // matches — translate core positions back into full-input coordinates for (int i = node.i; i > prevNode.i; i--) { - index[reportLine - 1] = dbLine; + index[prefix + reportLine - 1] = prefix + dbLine; reportLine--; dbLine--; } diff --git a/server/sonar-ce-task-projectanalysis/src/test/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinderTest.java b/server/sonar-ce-task-projectanalysis/src/test/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinderTest.java index 71d7ece039f0..489d93ec780e 100644 --- 
a/server/sonar-ce-task-projectanalysis/src/test/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinderTest.java +++ b/server/sonar-ce-task-projectanalysis/src/test/java/org/sonar/ce/task/projectanalysis/source/SourceLinesDiffFinderTest.java @@ -254,4 +254,162 @@ public void shouldIgnoreDeletedLinesAtTheStartOfTheFile() { assertThat(diff).containsExactly(3, 4); } + + /** + * Large fully-identical inputs (100K x 100K) — prefix trim consumes everything, + * Myers diff is never invoked. Verifies that identity is returned via the trim path. + */ + @Test + public void shouldReturnIdentityForLargeIdenticalInputsViaPrefixTrim() { + int size = 100_000; + List database = buildLines(size, "line-"); + List report = buildLines(size, "line-"); + + long start = System.nanoTime(); + int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertThat(diff).hasSize(size); + for (int i = 0; i < size; i++) { + assertThat(diff[i]).as("line " + i).isEqualTo(i + 1); + } + assertThat(elapsedMs).as("prefix trim must short-circuit Myers — wall time was " + elapsedMs + " ms") + .isLessThan(2_000L); + } + + /** + * Large near-identical inputs (80K x 80K, 100 lines changed in the middle). + * Prefix and suffix together trim 79 900 lines; Myers diff runs only on the + * 100x100 core. Result must be correct AND fast. 
+ */ + @Test + public void shouldRunMyersOnSmallCoreForLargeNearIdenticalInputs() { + int total = 80_000; + int prefixLen = 39_950; + int divergent = 100; + + List database = new ArrayList<>(total); + List report = new ArrayList<>(total); + for (int i = 0; i < prefixLen; i++) { + String shared = "shared-" + i; + database.add(shared); + report.add(shared); + } + for (int i = 0; i < divergent; i++) { + database.add("db-divergent-" + i); + report.add("rp-divergent-" + i); + } + int suffixLen = total - prefixLen - divergent; + for (int i = 0; i < suffixLen; i++) { + String shared = "tail-" + i; + database.add(shared); + report.add(shared); + } + + long start = System.nanoTime(); + int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertThat(diff).hasSize(total); + for (int i = 0; i < prefixLen; i++) { + assertThat(diff[i]).as("prefix line " + i).isEqualTo(i + 1); + } + for (int i = prefixLen; i < prefixLen + divergent; i++) { + assertThat(diff[i]).as("divergent line " + i).isZero(); + } + for (int i = prefixLen + divergent; i < total; i++) { + assertThat(diff[i]).as("suffix line " + i).isEqualTo(i + 1); + } + assertThat(elapsedMs).as("trim should leave only a 100x100 core — wall time was " + elapsedMs + " ms") + .isLessThan(2_000L); + } + + /** + * Asymmetric customer scenario: 30 000-line DB file vs 50-line scanner delta + * with no overlapping content (ARM EZ-Commit pattern). Prefix/suffix trim + * removes nothing; the asymmetry gate must fire and short-circuit Myers. 
+ */ + @Test + public void shouldShortCircuitOnAsymmetricDisjointInputs_30kVs50() { + List database = buildLines(30_000, "db-"); + List report = buildLines(50, "rp-"); + + long start = System.nanoTime(); + int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertThat(diff).hasSize(50); + assertThat(diff).containsOnly(0); + assertThat(elapsedMs).as("asymmetry gate must short-circuit Myers — wall time was " + elapsedMs + " ms") + .isLessThan(500L); + } + + /** + * BOI worst case: 100 000-line DB file vs 100-line scanner delta with no + * overlapping content. Both the cell gate (10 M cells > 4 M) and the asymmetry gate (ratio 1 000 > 100) apply; + * either is sufficient to short-circuit. Without a guard this took ~70 s + * on M-series hardware and ~19 minutes on the production cloud VM. + */ + @Test + public void shouldShortCircuitOnBoiScale_100kVs100Disjoint() { + List database = buildLines(100_000, "db-"); + List report = buildLines(100, "rp-"); + + long start = System.nanoTime(); + int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertThat(diff).hasSize(100); + assertThat(diff).containsOnly(0); + assertThat(elapsedMs).as("BOI-scale guarded call must complete fast — wall time was " + elapsedMs + " ms") + .isLessThan(500L); + } + + /** + * Symmetric fully-disjoint large input (5K x 5K): trim removes nothing, ratio + * is 1 so the asymmetry gate does not fire, but the cell gate (5 000 x 5 000 + * = 25 M cells > 4 M threshold) must short-circuit. 
+ */ + @Test + public void shouldShortCircuitOnLargeSymmetricDisjointInputs() { + List database = buildLines(5_000, "db-"); + List report = buildLines(5_000, "rp-"); + + long start = System.nanoTime(); + int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertThat(diff).hasSize(5_000); + assertThat(diff).containsOnly(0); + assertThat(elapsedMs).as("cell gate must short-circuit Myers — wall time was " + elapsedMs + " ms") + .isLessThan(500L); + } + + /** + * Small asymmetric input (4K x 50) — below {@code DIFF_ASYMMETRY_MIN_SIZE} so + * the asymmetry gate does NOT fire, and cells (200K) are well under the cell + * gate. Myers must still run and produce the correct (no-match) result. + */ + @Test + public void shouldRunMyersForSmallAsymmetricInputsBelowFloor() { + List database = buildLines(4_000, "db-"); + List report = buildLines(50, "rp-"); + + long start = System.nanoTime(); + int[] diff = new SourceLinesDiffFinder().findMatchingLines(database, report); + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + assertThat(diff).hasSize(50); + assertThat(diff).containsOnly(0); + assertThat(elapsedMs).as("below floor and below cell gate — Myers must run normally, wall time was " + elapsedMs + " ms") + .isLessThan(5_000L); + } + + private static List buildLines(int n, String prefix) { + List lines = new ArrayList<>(n); + for (int i = 0; i < n; i++) { + lines.add(prefix + String.format("%07d", i)); + } + return lines; + } }