diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 16bcb47..2a61e21 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,7 +2,7 @@ name: CI on: push: - branches: [main, "claude/**", "fix/**", "feat/**"] + branches: [main, "claude/**", "fix/**", "feat/**", "perf/**"] pull_request: branches: [main] workflow_dispatch: @@ -13,14 +13,23 @@ concurrency: jobs: backend: - name: Backend — Test + name: Backend — Test (shard ${{ matrix.shard }}) runs-on: ubuntu-latest permissions: checks: write contents: read + strategy: + fail-fast: false + matrix: + shard: [0, 1, 2, 3, 4, 5, 6, 7] defaults: run: working-directory: backend + env: + # Split the CQL-heavy suite across parallel runners; keep this in sync with the + # length of matrix.shard above. See build.gradle.kts for the hash-based selection. + TEST_SHARD_TOTAL: "8" + TEST_SHARD_INDEX: ${{ matrix.shard }} steps: - uses: actions/checkout@v4 - uses: actions/setup-java@v4 @@ -29,14 +38,25 @@ jobs: java-version: 21 - uses: gradle/actions/setup-gradle@v4 with: - cache-read-only: false - - name: Run backend tests + # Only one shard writes the shared Gradle cache to avoid concurrent-write + # contention; the rest read it. 
+ cache-read-only: ${{ matrix.shard != 0 }} + - name: Run backend tests (shard ${{ matrix.shard }}/${{ env.TEST_SHARD_TOTAL }}) run: ./gradlew test --build-cache --no-daemon + - name: Per-class timings (shard balancing data) + if: always() + run: | + shopt -s nullglob + for f in build/test-results/test/*.xml; do + cls=$(basename "$f" .xml); cls=${cls#TEST-} + t=$(sed -n 's/.*<testsuite[^>]*[[:space:]]time="\([0-9.]*\)".*/\1/p' "$f" | head -1) + echo "TIMING ${t:-0} ${cls}" + done | sort -t' ' -k2 -gr - name: Publish test results uses: dorny/test-reporter@v1 if: always() with: - name: Backend Tests + name: Backend Tests (shard ${{ matrix.shard }}) path: backend/build/test-results/test/*.xml reporter: java-junit diff --git a/backend/build.gradle.kts b/backend/build.gradle.kts index e9e2730..d1b4926 100644 --- a/backend/build.gradle.kts +++ b/backend/build.gradle.kts @@ -1,3 +1,6 @@ +import org.gradle.api.file.FileTreeElement +import org.gradle.api.specs.Spec + plugins { java id("org.springframework.boot") version "3.3.5" @@ -60,9 +63,36 @@ dependencyManagement { tasks.withType<Test> { useJUnitPlatform() - // CI gets two forks so long-running Spring/Testcontainers classes can overlap - // without turning the runner into a noisy stampede. - maxParallelForks = if (System.getenv("CI") == "true") 2 else 1 + // CI forks 4-wide so heavy Spring/CQL/Testcontainers classes in a shard overlap + // (ubuntu-latest has 4 vCPUs). Override via GRADLE_TEST_FORKS. + maxParallelForks = System.getenv("GRADLE_TEST_FORKS")?.toIntOrNull() + ?: if (System.getenv("CI") == "true") 4 else 1 + // Cap per-fork heap so 4 JVMs + their Postgres containers fit the runner's RAM; + // prod runs the app on 768m, so 1.5g per test fork is ample. + if (System.getenv("CI") == "true") { + maxHeapSize = "1536m" + } + // Optional CI matrix sharding: split the test classes across parallel runner jobs + // by a stable path hash, so each class runs in exactly one shard and the union of + // shards 0..TEST_SHARD_TOTAL-1 covers the whole suite. 
This is the lever that cuts + // the CQL-heavy backend suite from ~44 min on one runner to a few minutes across + // several. With no shard env set (local runs), the full suite runs as before. + val shardTotal = System.getenv("TEST_SHARD_TOTAL")?.toIntOrNull() + val shardIndex = System.getenv("TEST_SHARD_INDEX")?.toIntOrNull() + if (shardTotal != null && shardTotal > 1 && shardIndex != null) { + // FileTreeElement.path is always '/'-separated and relative to the test + // classes root, so the hash is stable across OSes. Directories must pass so + // the tree is traversed into; only .class candidates are assigned to a shard. + include(Spec<FileTreeElement> { element -> + element.isDirectory || + Math.floorMod(element.path.hashCode(), shardTotal) == shardIndex + }) + doFirst { + logger.lifecycle("Backend test shard $shardIndex/$shardTotal active") + } + } + // Keep binary in-progress results outside the OneDrive tree so sync cannot // race against Gradle's rename of these short-lived files (NoSuchFileException). 
binaryResultsDirectory.set( diff --git a/backend/src/test/java/com/workwell/caseflow/CaseFlowRerunIntegrationTest.java b/backend/src/test/java/com/workwell/caseflow/CaseFlowRerunIntegrationTest.java index 5d724e0..c214470 100644 --- a/backend/src/test/java/com/workwell/caseflow/CaseFlowRerunIntegrationTest.java +++ b/backend/src/test/java/com/workwell/caseflow/CaseFlowRerunIntegrationTest.java @@ -7,13 +7,15 @@ import java.util.List; import java.util.Map; import java.util.UUID; -import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.jdbc.core.JdbcTemplate; @SpringBootTest +@TestInstance(TestInstance.Lifecycle.PER_CLASS) class CaseFlowRerunIntegrationTest extends AbstractIntegrationTest { @Autowired @@ -25,7 +27,10 @@ class CaseFlowRerunIntegrationTest extends AbstractIntegrationTest { @Autowired private JdbcTemplate jdbcTemplate; - @BeforeEach + // Each test targets a case of a distinct outcome type (COMPLIANT/EXCLUDED/DUE_SOON/ + // OVERDUE/MISSING_DATA) and verifies rerun behavior on it; the targets don't overlap, + // so one shared population run is enough instead of a full run before each of 5 tests. 
+ @BeforeAll void seedData() { jdbcTemplate.execute("TRUNCATE TABLE runs, outcomes, cases, case_actions, run_logs, audit_events, outreach_records, scheduled_appointments, waivers, evidence_attachments CASCADE"); allProgramsRunService.runAllPrograms("All Programs", "admin@workwell.dev"); diff --git a/backend/src/test/java/com/workwell/web/EvidenceAccessIntegrationTest.java b/backend/src/test/java/com/workwell/web/EvidenceAccessIntegrationTest.java index 31083c3..caeedb0 100644 --- a/backend/src/test/java/com/workwell/web/EvidenceAccessIntegrationTest.java +++ b/backend/src/test/java/com/workwell/web/EvidenceAccessIntegrationTest.java @@ -19,8 +19,9 @@ import java.util.Comparator; import java.util.Map; import java.util.UUID; -import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc; import org.springframework.boot.test.context.SpringBootTest; @@ -37,6 +38,7 @@ "workwell.auth.jwt-secret=test-secret-for-evidence-security" }) @AutoConfigureMockMvc +@TestInstance(TestInstance.Lifecycle.PER_CLASS) class EvidenceAccessIntegrationTest extends AbstractIntegrationTest { private static final Path evidenceRoot = createEvidenceRoot(); @@ -61,8 +63,12 @@ static void evidenceProperties(DynamicPropertyRegistry registry) { @Autowired private ObjectMapper objectMapper; - @BeforeEach - void resetState() throws Exception { + // Evidence access/role tests are read-only against the seeded population: each test + // uploads its own attachment (unique id) and filters audit by that id, so a single + // population run shared across the class is sufficient. This drops the class from + // ~14 full-population runs (~17 min) to one (~90s). 
+ @BeforeAll + void seedPopulationOnce() throws Exception { jdbcTemplate.execute("TRUNCATE TABLE runs, outcomes, cases, case_actions, run_logs, audit_events, evidence_attachments, outreach_records, scheduled_appointments, waivers CASCADE"); deleteEvidenceFiles(); Files.createDirectories(evidenceRoot);